Take a UTF-8 character as an argument and return the integer (also known as "code point" in Unicode) representing its UTF-8 binary string.
This function is like an extension of char() for the UTF-8 codeset.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | chara |
pure function ichar_utf8 (chara) result(res) use, intrinsic :: iso_fortran_env implicit none character(*), intent(in) :: chara ! Input one UTF-8 character integer(int32) :: res ! Resulting integer representing an UTF-8 binary string. integer(int8) :: byte(4) ! Byte array (32bit) integer(int8) :: shift_3, shift_4, shift_5, shift_7 ! Shift values integer(int32) :: buf ! Buffer for bit operations character(4) :: cache integer(int8), parameter :: mask_2_bit = int(z'3f', kind=int8) ! '00111111' 6-bit mask for continuation bytes. integer(int8), parameter :: mask_3_bit = int(z'1f', kind=int8) ! '00011111' 5-bit mask for 2-byte characters. integer(int8), parameter :: mask_4_bit = int(z'0f', kind=int8) ! '00001111' 4-bit mask for 3-byte characters. integer(int8), parameter :: mask_5_bit = int(z'07', kind=int8) ! '00000111' 3-bit mask for 4-byte characters. res = 0 ! Initialize result if (len(chara) > 4) then ! Check if the length of input character is more than 4 bytes. res = -1 ! Invalid UTF-8 character. return end if cache = adjustl(chara) ! Convert a multi-byte character to thier integer byte representation. byte(1) = int(ichar(cache(1:1)),kind(byte)) byte(2) = int(ichar(cache(2:2)), kind(byte)) byte(3) = int(ichar(cache(3:3)), kind(byte)) byte(4) = int(ichar(cache(4:4)), kind(byte)) select case (iand(byte(1), -8_int8)) ! -8 = 0xf8 case (0:127) res = byte(1) case (-64:-33) res = ior(ishft(int(iand(byte(1), mask_3_bit),kind=int32),6), & int(iand(byte(2), mask_2_bit),kind=int32)) case (-32:-17) res = ior(& ishft( & ior(& ishft(int(iand(byte(1), mask_4_bit), kind=int32),6),& int(iand(byte(2), mask_2_bit), kind=int32)),6),& int(iand(byte(3), mask_2_bit), kind=int32)) case (-16:-9) res = ior(& ishft(& ior(& ishft(& ior(& ishft(int( iand(byte(1), mask_5_bit),kind=int32),6),& int(iand(byte(2), mask_2_bit),kind=int32)),6), & int(iand(byte(3), mask_2_bit),kind=int32)),6), & int(iand(byte(4), mask_2_bit),kind=int32)) case default res = -1 end select ! return ! ! Perform bit shifts to determine character's byte-length. ! shift_3 = ishft(byte(1), -3) ! shift_4 = ishft(byte(1), -4) ! shift_5 = ishft(byte(1), -5) ! shift_7 = ishft(byte(1), -7) ! ! 1-byte character ! if (shift_7 == 0) then ! res = byte(1) ! ! 4-byte character ! else if (shift_3 == 30) then ! ! First 1 byte ! res = iand(byte(1), mask_5_bit) ! ! Continuation bytes ! res = ishft(res, 6) ! Left shift by 6 bits and store into res ! buf = iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`. ! res = ior(res, buf) ! Take the bitwise OR of `res` and `buf`. The same applies below. ! res = ishft(res, 6) ! buf = iand(byte(3), mask_2_bit) ! res = ior(res, buf) ! res = ishft(res, 6) ! buf = iand(byte(4), mask_2_bit) ! res = ior(res, buf) ! ! 3-byte character ! else if (shift_4 == 14) then ! res = iand(byte(1), mask_4_bit) ! res = ishft(res, 6) ! buf = iand(byte(2), mask_2_bit) ! res = ior(res, buf) ! res = ishft(res, 6) ! buf = iand(byte(3), mask_2_bit) ! res = ior(res, buf) ! ! 2-byte character ! else if (shift_5 == 6) then ! res = iand(byte(1), mask_3_bit) ! res = ishft(res, 6) ! buf = iand(byte(2), mask_2_bit) ! res = ior(res, buf) ! end if end function ichar_utf8