Take a UTF-8 character as an argument and return the integer (also known as "code point" in Unicode) representing its UTF-8 binary string.
This function is like an extension of char() for the UTF-8 codeset.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=*), | intent(in) | :: | chara |
pure function ichar_utf8 (chara) result(res) use, intrinsic :: iso_fortran_env implicit none character(*), intent(in) :: chara ! Input one UTF-8 character integer(int32) :: res ! Resulting integer representing an UTF-8 binary string. integer(int8) :: byte(4) ! Byte array (32bit) integer(int8) :: shift_3, shift_4, shift_5, shift_7 ! Shift values integer(int8) :: mask_2_bit, mask_3_bit, mask_4_bit, mask_5_bit ! Masks for bit operations integer(int32) :: buf ! Buffer for bit operations character(8) :: binary ! 8-byte character string representing binary. binary = '00111111' ! 6-bit mask for continuation bytes. read(binary, '(b8.8)') mask_2_bit binary = '00011111' ! 5-bit mask for 2-byte characters. read(binary, '(b8.8)') mask_3_bit binary = '00001111' ! 4-bit mask for 3-byte characters. read(binary, '(b8.8)') mask_4_bit binary = '00000111' ! 3-bit mask for 4-byte characters. read(binary, '(b8.8)') mask_5_bit res = 0 ! Initialize result if (len(chara) > 4) then ! Check if the length of input character is more than 4 bytes. res = -1 ! Invalid UTF-8 character. return end if ! Convert a multi-byte character to thier integer byte representation. byte(1) = int(ichar(chara(1:1)),kind(byte)) if (len(chara) >= 2) byte(2) = int(ichar(chara(2:2)), kind(byte)) if (len(chara) >= 3) byte(3) = int(ichar(chara(3:3)), kind(byte)) if (len(chara) >= 4) byte(4) = int(ichar(chara(4:4)), kind(byte)) ! Perform bit shifts to determine character's byte-length. shift_3 = ishft(byte(1), -3) shift_4 = ishft(byte(1), -4) shift_5 = ishft(byte(1), -5) shift_7 = ishft(byte(1), -7) ! 1-byte character if (shift_7 == 0) then res = byte(1) return ! 4-byte character else if (shift_3 == 30) then ! First 1 byte res = iand(byte(1), mask_5_bit) ! Continuation bytes res = ishft(res, 6) ! Left shift by 6 bits and store into res buf = iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`. res = ior(res, buf) ! Take the bitwise OR of `res` and `buf`. The same applies below. res = ishft(res, 6) buf = iand(byte(3), mask_2_bit) res = ior(res, buf) res = ishft(res, 6) buf = iand(byte(4), mask_2_bit) res = ior(res, buf) ! 3-byte character else if (shift_4 == 14) then res = iand(byte(1), mask_4_bit) res = ishft(res, 6) buf = iand(byte(2), mask_2_bit) res = ior(res, buf) res = ishft(res, 6) buf = iand(byte(3), mask_2_bit) res = ior(res, buf) ! 2-byte character else if (shift_5 == 6) then res = iand(byte(1), mask_3_bit) res = ishft(res, 6) buf = iand(byte(2), mask_2_bit) res = ior(res, buf) end if end function ichar_utf8