ichar_utf8 Function

public pure function ichar_utf8(chara) result(res)

Take a UTF-8 character as an argument and return the integer (also known as "code point" in Unicode) representing its UTF-8 binary string.

This function is like an extension of char() for the UTF-8 codeset.

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: chara

Return Value integer(kind=int32)


Source Code

   pure function ichar_utf8 (chara) result(res)
      use, intrinsic :: iso_fortran_env
      implicit none
      character(*), intent(in) :: chara   ! Input one UTF-8 character

      integer(int32) :: res         ! Resulting integer representing an UTF-8 binary string.
      integer(int8)  :: byte(4)     ! Byte array (32bit)
      integer(int8)  :: shift_3, shift_4, shift_5, shift_7              ! Shift values
      integer(int8)  :: mask_2_bit, mask_3_bit, mask_4_bit, mask_5_bit  ! Masks for bit operations
      integer(int32) :: buf         ! Buffer for bit operations

      character(8) :: binary        ! 8-byte character string representing binary.

      binary = '00111111'           ! 6-bit mask for continuation bytes.
      read(binary, '(b8.8)') mask_2_bit

      binary = '00011111'           ! 5-bit mask for 2-byte characters.
      read(binary, '(b8.8)') mask_3_bit

      binary = '00001111'           ! 4-bit mask for 3-byte characters.
      read(binary, '(b8.8)') mask_4_bit

      binary = '00000111'           ! 3-bit mask for 4-byte characters.
      read(binary, '(b8.8)') mask_5_bit

      res = 0     ! Initialize result

      if (len(chara) > 4)  then     ! Check if the length of input character is more than 4 bytes.
         res = -1                   ! Invalid UTF-8 character.
         return
      end if

      ! Convert a multi-byte character to thier integer byte representation.
      byte(1) = int(ichar(chara(1:1)),kind(byte))
      if (len(chara) >= 2) byte(2) = int(ichar(chara(2:2)), kind(byte))
      if (len(chara) >= 3) byte(3) = int(ichar(chara(3:3)), kind(byte))
      if (len(chara) >= 4) byte(4) = int(ichar(chara(4:4)), kind(byte))

      ! Perform bit shifts to determine character's byte-length.
      shift_3 = ishft(byte(1), -3)
      shift_4 = ishft(byte(1), -4)
      shift_5 = ishft(byte(1), -5)
      shift_7 = ishft(byte(1), -7)

      ! 1-byte character
      if (shift_7 == 0) then

         res = byte(1)
         return

      ! 4-byte character
      else if (shift_3 == 30) then

         ! First 1 byte
         res =  iand(byte(1), mask_5_bit)

         ! Continuation bytes
         res = ishft(res, 6)     ! Left shift by 6 bits and store into res
         buf =  iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`.
         res =   ior(res, buf)   ! Take the bitwise OR of `res` and `buf`. The same applies below.

         res = ishft(res, 6)
         buf =  iand(byte(3), mask_2_bit)
         res =   ior(res, buf)

         res = ishft(res, 6)
         buf =  iand(byte(4), mask_2_bit)
         res =   ior(res, buf)

      ! 3-byte character
      else if (shift_4 == 14) then

         res =  iand(byte(1), mask_4_bit)

         res = ishft(res, 6)
         buf =  iand(byte(2), mask_2_bit)
         res =   ior(res, buf)

         res = ishft(res, 6)
         buf =  iand(byte(3), mask_2_bit)
         res =   ior(res, buf)

      ! 2-byte character
      else if (shift_5 == 6) then

         res =  iand(byte(1), mask_3_bit)

         res = ishft(res, 6)
         buf =  iand(byte(2), mask_2_bit)
         res =   ior(res, buf)

      end if
   end function ichar_utf8