ichar_utf8 Function

public pure function ichar_utf8(chara) result(res)

Take a UTF-8 character as an argument and return the integer (also known as "code point" in Unicode) representing its UTF-8 binary string.

This function is like an extension of char() for the UTF-8 codeset.

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: chara

Return Value integer(kind=int32)


Source Code

   pure function ichar_utf8 (chara) result(res)
      use, intrinsic :: iso_fortran_env
      implicit none
      character(*), intent(in) :: chara   ! Input one UTF-8 character

      integer(int32) :: res         ! Resulting integer representing an UTF-8 binary string.
      integer(int8)  :: byte(4)     ! Byte array (32bit)
      integer(int8)  :: shift_3, shift_4, shift_5, shift_7              ! Shift values
      integer(int32) :: buf         ! Buffer for bit operations
      character(4) :: cache
      integer(int8), parameter :: mask_2_bit = int(z'3f', kind=int8) ! '00111111' 6-bit mask for continuation bytes.
      integer(int8), parameter :: mask_3_bit = int(z'1f', kind=int8) ! '00011111' 5-bit mask for 2-byte characters.
      integer(int8), parameter :: mask_4_bit = int(z'0f', kind=int8) ! '00001111' 4-bit mask for 3-byte characters.
      integer(int8), parameter :: mask_5_bit = int(z'07', kind=int8) ! '00000111' 3-bit mask for 4-byte characters.

      res = 0     ! Initialize result

      if (len(chara) > 4)  then     ! Check if the length of input character is more than 4 bytes.
         res = -1                   ! Invalid UTF-8 character.
         return
      end if
      cache = adjustl(chara)
      ! Convert a multi-byte character to thier integer byte representation.
      byte(1) = int(ichar(cache(1:1)),kind(byte))
      byte(2) = int(ichar(cache(2:2)), kind(byte))
      byte(3) = int(ichar(cache(3:3)), kind(byte))
      byte(4) = int(ichar(cache(4:4)), kind(byte))


      select case (iand(byte(1), -8_int8)) ! -8 = 0xf8
      case (0:127)
         res = byte(1)
      case (-64:-33)
         res = ior(ishft(int(iand(byte(1), mask_3_bit),kind=int32),6), &
                   int(iand(byte(2), mask_2_bit),kind=int32))
      case (-32:-17)
         res = ior(&
                  ishft( &
                     ior(&
                        ishft(int(iand(byte(1), mask_4_bit), kind=int32),6),&
                        int(iand(byte(2), mask_2_bit), kind=int32)),6),&
                  int(iand(byte(3), mask_2_bit), kind=int32))
      case (-16:-9)
         res = ior(&
                  ishft(&
                     ior(&
                        ishft(&
                            ior(&
                              ishft(int( iand(byte(1), mask_5_bit),kind=int32),6),&
                              int(iand(byte(2), mask_2_bit),kind=int32)),6), &
                        int(iand(byte(3), mask_2_bit),kind=int32)),6), &
                     int(iand(byte(4), mask_2_bit),kind=int32))
      case default
         res = -1
      end select

      ! return

      ! ! Perform bit shifts to determine character's byte-length.
      ! shift_3 = ishft(byte(1), -3)
      ! shift_4 = ishft(byte(1), -4)
      ! shift_5 = ishft(byte(1), -5)
      ! shift_7 = ishft(byte(1), -7)

      ! ! 1-byte character
      ! if (shift_7 == 0) then

      !    res = byte(1)

      ! ! 4-byte character
      ! else if (shift_3 == 30) then

      !    ! First 1 byte
      !    res =  iand(byte(1), mask_5_bit)

      !    ! Continuation bytes
      !    res = ishft(res, 6)     ! Left shift by 6 bits and store into res
      !    buf =  iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`.
      !    res =   ior(res, buf)   ! Take the bitwise OR of `res` and `buf`. The same applies below.

      !    res = ishft(res, 6)
      !    buf =  iand(byte(3), mask_2_bit)
      !    res =   ior(res, buf)

      !    res = ishft(res, 6)
      !    buf =  iand(byte(4), mask_2_bit)
      !    res =   ior(res, buf)

      ! ! 3-byte character
      ! else if (shift_4 == 14) then

      !    res =  iand(byte(1), mask_4_bit)

      !    res = ishft(res, 6)
      !    buf =  iand(byte(2), mask_2_bit)
      !    res =   ior(res, buf)

      !    res = ishft(res, 6)
      !    buf =  iand(byte(3), mask_2_bit)
      !    res =   ior(res, buf)

      ! ! 2-byte character
      ! else if (shift_5 == 6) then

      !    res =  iand(byte(1), mask_3_bit)

      !    res = ishft(res, 6)
      !    buf =  iand(byte(2), mask_2_bit)
      !    res =   ior(res, buf)

      ! end if
   end function ichar_utf8