This file contains procedures to handle UTF-8 character set.
! Fortran Regular Expression (Forgex) ! ! MIT License ! ! (C) Amasaki Shinobu, 2023-2025 ! A regular expression engine for Fortran. ! forgex_utf8_m module is a part of Forgex. !! This file contains procedures to handle UTF-8 character set. !> The `forgex_utf8_m` module processes a byte-indexed character strings type as UTF-8 strings. module forgex_utf8_m use :: iso_fortran_env, only: int8 use :: iso_c_binding implicit none private public :: idxutf8 public :: next_idxutf8 ! for processing pattern public :: next_idxutf8_strict ! for processing text (Cannot be assumed to be UTF-8 character) public :: char_utf8, ichar_utf8 public :: make_replacement_char public :: count_token public :: is_first_byte_of_character public :: is_first_byte_of_character_array public :: len_trim_utf8, len_utf8 public :: is_valid_multiple_byte_character public :: adjustl_multi_byte public :: trim_invalid_utf8_byte public :: reverse_utf8 integer(int8), parameter, public :: fullbit = -1 ! 11111111 integer(int8), parameter, public :: ascii_mask= 127 ! 01111111 integer(int8), parameter, public :: lead_2_mask = -33 ! 11011111 integer(int8), parameter, public :: lead_3_mask = -17 ! 11101111 integer(int8), parameter, public :: lead_4_mask= -9 ! 11110111 integer(int8), parameter, public :: continuation_mask = -65 ! 10111111 contains ! INDEX OF UTF8 !> This function returns the index of the end of the (multibyte) character, !> given the string str and the current index curr. pure function idxutf8 (str, curr) result(tail) use, intrinsic :: iso_fortran_env use :: forgex_parameters_m implicit none character(*), intent(in) :: str ! Input string, a multibyte character is expected. integer(int32), intent(in) :: curr ! Current index. integer(int32) :: tail ! Resulting index of the end of the character. integer(int32) :: i ! Loop variable. integer(int8) :: byte ! Variable to hold the byte value of the 1-byte part of the character integer(int8) :: shift_3, shift_4, shift_5, shift_6, shift_7 ! Shifted byte values. ! If the index exceeds the length of str, return the invalid value. if (curr > len(str)) then tail = INVALID_CHAR_INDEX return end if tail = curr ! Initialize tail to the current index. !! Class of invalid UTF-8 characters !! 1. invalid lead byte !! 2. invalid trail byte !! 3. overrun !! 4. over long encoding !! 5. incomplete multibyte sequence !! 6. invalid character range (U+D800-U+DFFF) !! 7. BOM appears in the middle !! 8. isolated trail byte ! !! In the above case, `idxutf8` will returns `curr`. !! Then, you should call `is_valid_multiple_byte_character` at a higher level to validate the substring. outer: do i = 0, 3 ! Loop over the next four bytes to determine the byte-length of the character. ! for terminated incomplete multibyte character if (curr+i > len(str)) then tail = curr return end if byte = int(ichar(str(curr+i:curr+i)), kind(byte)) ! Get the byte value of the character at position `curr+1`. shift_3 = ishft(byte, -3) ! Right shift the byte by 3 bits shift_4 = ishft(byte, -4) ! Right shift the byte by 3 bits shift_5 = ishft(byte, -5) ! Right shift the byte by 5 bits shift_6 = ishft(byte, -6) ! Right shift the byte by 6 bits shift_7 = ishft(byte, -7) ! Right shift the byte by 7 bits if (shift_6 == 2) cycle ! Continue to the next iteration if the `byte` is a continuation byte (10xxxxxx_2). if (i == 0) then ! Check the first byte to determine the character length. if (shift_3 == 30 ) then ! If the byte starts with 11110_2 (4-byte character). tail = curr + 4 - 1 exit outer end if if (shift_4 == 14) then ! If the byte starts witth 1110_2 (3-byte character). tail = curr + 3 - 1 exit outer end if if (shift_5 == 6) then ! If the byte starts with 110_2 (2-byte character). tail = curr + 2 - 1 exit outer end if if (shift_7 == 0) then ! If then byte starts with 0_2 (1-byte character). tail = curr + 1 - 1 exit outer end if else ! Check continuation byptes if (shift_3 == 30 .or. shift_4 == 14 .or. shift_5 == 6 .or. shift_7 == 0) then tail = curr + i - 1 exit outer end if end if end do outer if (tail <= len(str)) then if (.not. is_valid_multiple_byte_character(str(curr:tail))) then tail = curr else return end if else tail = curr end if end function idxutf8 !> This function returns the index of the next character, !> given the string str and the current index curr. !> If the current index is for the last character, it returns the invalid value. pure function next_idxutf8(str, curr) result(res) use :: forgex_parameters_m implicit none character(*), intent(in) :: str integer, intent(in) :: curr integer :: res integer :: curr_end curr_end = idxutf8(str, curr) if (curr_end /= INVALID_CHAR_INDEX) then res = curr_end + 1 else res = INVALID_CHAR_INDEX end if end function next_idxutf8 !> This subroutine returns the index of the next UTF-8 character conteined in `str`. !> This is used to handle strings that may not be encoded by UTF-8. pure subroutine next_idxutf8_strict(str, curr, next, is_valid) use :: forgex_parameters_m implicit none character(*), intent(in) :: str integer, intent(in) :: curr integer, intent(inout) :: next logical, intent(inout) :: is_valid integer :: ib, ie ! initialize is_valid = .false. ib = curr ie = idxutf8(str, ib) if (ie /= INVALID_CHAR_INDEX) then is_valid = is_valid_multiple_byte_character(str(ib:ie)) next = ie + 1 else next = curr+1 is_valid = .false. end if end subroutine next_idxutf8_strict !> This function checks the input byte string is valid as a single UTF-8 character. pure function is_valid_multiple_byte_character(chara) result(res) use, intrinsic :: iso_fortran_env, only: int32, int8 implicit none character(*), intent(in) :: chara logical :: res integer :: siz, i, expected_siz integer(int8) :: shift_3, shift_4, shift_5, shift_6, shift_7 integer(int8) :: byte res = .true. siz = len(chara) byte = ichar(chara(1:1), kind=int8) shift_3 = ishft(byte, -3) ! Right shift the byte by 3 bits shift_4 = ishft(byte, -4) ! Right shift the byte by 4 bits shift_5 = ishft(byte, -5) ! Right shift the byte by 5 bits shift_6 = ishft(byte, -6) ! Right shift the byte by 6 bits shift_7 = ishft(byte, -7) ! Right shift the byte by 7 bits ! 1st byte if (shift_3 == 31) then ! 5-byte character (invalid) 11111xxx_2 res = .false. return else if (shift_3 == 30) then ! 4-byte character `11110xxx_2` expected_siz = 4 else if (shift_4 == 14)then ! 3 byte character `1110xxxx_2` expected_siz = 3 else if (shift_5 == 6) then ! 2-byte character `110xxxxx_2` expected_siz = 2 else if (shift_7 == 0) then ! for 1-byte character `0xxxxxxx` expected_siz = 1 else res = .false. return end if if (expected_siz /= siz) then res = .false. return end if do i = 2, expected_siz byte = ichar(chara(i:i), kind=int8) shift_6 = ishft(byte, -6) ! Right shift the byte by 6 bits such as `10xxxxxx_2` if (shift_6 /= 2) then res = .false. return end if end do end function is_valid_multiple_byte_character !> The `char_utf8` function takes a code point as integer in Unicode character set, !> and returns the corresponding character as UTF-8 binary string. !> !> This function is like an extension of char() for the UTF-8 codeset. pure function char_utf8 (code) result(str) use, intrinsic :: iso_fortran_env implicit none integer(int32), intent(in) :: code ! Input Unicode code point. character(:), allocatable :: str ! Resulting one UTF-8 character. character(32), allocatable :: bin ! A 32-digit number expressed in character format for masking. integer(int32) :: buf, mask ! Buffer and mask for bit operations. integer(int8) :: byte(4) ! Array to hold up 4 bytes of the UTF-8 character. str = '' ! Initialize result string. buf = code ! Initialize buffer with input `code` point. bin = '0000000000000000000000000111111' ! Lower 6-bit mask read(bin, '(b32.32)') mask ! Read the `mask` from the `bin` character string. byte(1) = int(iand(ishft(buf, -18), mask),kind(byte)) ! First byte buf = code byte(2) = int(iand(ishft(buf, -12), mask), kind(byte)) ! Second byte buf = code byte(3) = int(iand(ishft(buf, -6), mask), kind(byte)) ! Third byte buf = code byte(4) = int(iand(buf, mask), kind(byte)) ! Fourth byte if (code > 2**7-1) then ! Check if the `code` point is greater than 127 (non-ASCII character). if (2**16 -1 < code) then ! 4-byte character byte(1) = ibset(byte(1),7) byte(1) = ibset(byte(1),6) byte(1) = ibset(byte(1),5) byte(1) = ibset(byte(1),4) byte(1) = ibclr(byte(1),3) byte(2) = set_continuation_byte(byte(2)) ! Set continuation bytes. byte(3) = set_continuation_byte(byte(3)) byte(4) = set_continuation_byte(byte(4)) else if (2**11 - 1 < code) then ! 3-byte character byte(1) = 32 byte(2) = ibset(byte(2), 7) byte(2) = ibset(byte(2), 6) byte(2) = ibset(byte(2), 5) byte(2) = ibclr(byte(2), 4) byte(3) = set_continuation_byte(byte(3)) byte(4) = set_continuation_byte(byte(4)) else if (2**7 -1 < code) then ! 2-byte character byte(1) = 32 byte(2) = 32 byte(3) = ibset(byte(3), 7) byte(3) = ibset(byte(3), 6) byte(3) = ibclr(byte(3), 5) byte(4) = set_continuation_byte(byte(4)) end if str = char(byte(1)) //char(byte(2)) //char(byte(3)) //char(byte(4)) ! Concatenate bytes into a string. str = trim(adjustl(str)) ! Trim leading and tailing space. else str = char(code) ! For ASCII characters. end if end function char_utf8 !> This function take one byte, set the first two bits to 10, and !> returns one byte of the continuation part. pure function set_continuation_byte(byte) result(res) use, intrinsic :: iso_fortran_env, only: int8 implicit none integer(int8), intent(in) :: byte integer(int8) :: res res = ibset(byte, 7) ! 1xxxxxxx res = ibclr(res, 6) ! 10xxxxxx end function set_continuation_byte !> Take a UTF-8 character as an argument and !> return the integer (also known as "code point" in Unicode) representing !> its UTF-8 binary string. !> !> This function is like an extension of char() for the UTF-8 codeset. pure function ichar_utf8 (chara) result(res) use, intrinsic :: iso_fortran_env implicit none character(*), intent(in) :: chara ! Input one UTF-8 character integer(int32) :: res ! Resulting integer representing an UTF-8 binary string. integer(int8) :: byte(4) ! Byte array (32bit) integer(int8) :: shift_3, shift_4, shift_5, shift_7 ! Shift values integer(int32) :: buf ! Buffer for bit operations character(4) :: cache integer(int8), parameter :: mask_2_bit = int(z'3f', kind=int8) ! '00111111' 6-bit mask for continuation bytes. integer(int8), parameter :: mask_3_bit = int(z'1f', kind=int8) ! '00011111' 5-bit mask for 2-byte characters. integer(int8), parameter :: mask_4_bit = int(z'0f', kind=int8) ! '00001111' 4-bit mask for 3-byte characters. integer(int8), parameter :: mask_5_bit = int(z'07', kind=int8) ! '00000111' 3-bit mask for 4-byte characters. res = 0 ! Initialize result if (len(chara) > 4) then ! Check if the length of input character is more than 4 bytes. res = -1 ! Invalid UTF-8 character. return end if cache = adjustl(chara) ! Convert a multi-byte character to thier integer byte representation. byte(1) = int(ichar(cache(1:1)),kind(byte)) byte(2) = int(ichar(cache(2:2)), kind(byte)) byte(3) = int(ichar(cache(3:3)), kind(byte)) byte(4) = int(ichar(cache(4:4)), kind(byte)) select case (iand(byte(1), -8_int8)) ! -8 = 0xf8 case (0:127) res = byte(1) case (-64:-33) res = ior(ishft(int(iand(byte(1), mask_3_bit),kind=int32),6), & int(iand(byte(2), mask_2_bit),kind=int32)) case (-32:-17) res = ior(& ishft( & ior(& ishft(int(iand(byte(1), mask_4_bit), kind=int32),6),& int(iand(byte(2), mask_2_bit), kind=int32)),6),& int(iand(byte(3), mask_2_bit), kind=int32)) case (-16:-9) res = ior(& ishft(& ior(& ishft(& ior(& ishft(int( iand(byte(1), mask_5_bit),kind=int32),6),& int(iand(byte(2), mask_2_bit),kind=int32)),6), & int(iand(byte(3), mask_2_bit),kind=int32)),6), & int(iand(byte(4), mask_2_bit),kind=int32)) case default res = -1 end select ! return ! ! Perform bit shifts to determine character's byte-length. ! shift_3 = ishft(byte(1), -3) ! shift_4 = ishft(byte(1), -4) ! shift_5 = ishft(byte(1), -5) ! shift_7 = ishft(byte(1), -7) ! ! 1-byte character ! if (shift_7 == 0) then ! res = byte(1) ! ! 4-byte character ! else if (shift_3 == 30) then ! ! First 1 byte ! res = iand(byte(1), mask_5_bit) ! ! Continuation bytes ! res = ishft(res, 6) ! Left shift by 6 bits and store into res ! buf = iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`. ! res = ior(res, buf) ! Take the bitwise OR of `res` and `buf`. The same applies below. ! res = ishft(res, 6) ! buf = iand(byte(3), mask_2_bit) ! res = ior(res, buf) ! res = ishft(res, 6) ! buf = iand(byte(4), mask_2_bit) ! res = ior(res, buf) ! ! 3-byte character ! else if (shift_4 == 14) then ! res = iand(byte(1), mask_4_bit) ! res = ishft(res, 6) ! buf = iand(byte(2), mask_2_bit) ! res = ior(res, buf) ! res = ishft(res, 6) ! buf = iand(byte(3), mask_2_bit) ! res = ior(res, buf) ! ! 2-byte character ! else if (shift_5 == 6) then ! res = iand(byte(1), mask_3_bit) ! res = ishft(res, 6) ! buf = iand(byte(2), mask_2_bit) ! res = ior(res, buf) ! end if end function ichar_utf8 pure function make_replacement_char() result(replace) implicit none character(3) :: replace replace = char_utf8(65535) ! U+FFFF end function make_replacement_char !> This function calculates the length of a UTF-8 string excluding tailing spaces. !> !> It takes a UTF-8 string as input and returns the number of characters in the string, !> ignoring any tailing whitespace characters. pure function len_trim_utf8(str) result(count) implicit none character(*), intent(in) :: str integer :: i, inext, count ! Initialize i = 1 count = 0 ! Loop through the string until the end of the trimed string is reached. do while(i <= len_trim(str)) inext = idxutf8(str, i) + 1 ! Get the index of the next UTF-8 character. count = count + 1 ! Increment the character count. i = inext ! Move to the next character. end do end function len_trim_utf8 !> This function calculates the length of a UTF-8 string. !> !> It takes a UTF-8 string as input and returns the number of characters in the string. pure function len_utf8(str) result(count) implicit none character(*), intent(in) :: str integer :: i, inext, count ! Initialize i = 1 count = 0 ! Loop through the string until the end of the string is reached. do while(i <= len(str)) inext = idxutf8(str, i) + 1 ! Get the index of the next UTF-8 character. count = count + 1 ! Increment the character count. i = inext ! Move to the next character. end do end function len_utf8 !> This function determines if a given character is the first byte of !> a UTF-8 multibyte character. It takes a 1-byte character as input !> and returns a logical value indicating if it is the first byte of !> an UTF-8 binary string. pure function is_first_byte_of_character(chara) result(res) use, intrinsic :: iso_fortran_env implicit none character(1), intent(in) :: chara ! Input single byte character logical :: res ! Result indicating if it is the first byte of a multibyte character. integer(int8) :: byte, shift_6 ! Integer representation of the character and shifted value. ! Convert the character to its integer representation byte = int(ichar(chara), kind(byte)) ! Initialize the result to `.true.` (assume it is the first byte). res = .true. ! Shift the byte 6 bits to the right. shift_6 = ishft(byte, -6) ! If the shifted value equals 2 (10_2), it is a continuation byte, not the first byte. if (shift_6 == 2) res = .false. end function is_first_byte_of_character !> This subroutine determines if each character in a given string is the first byte of a UTF-8 multibyte character. !> It takes a UTF-8 string and return a logical array indicating for each position if it is the first byte. pure subroutine is_first_byte_of_character_array (str, array, length) use, intrinsic :: iso_fortran_env, only: int32 implicit none logical, allocatable, intent(inout) :: array(:) ! Output logical array indicating first byte status. integer(int32), intent(in) :: length ! Length of the input string character(len=length), intent(in) :: str ! Input UTF-8 string integer :: i ! Loop index variable ! Deallocate the array if it is already allocated. if (allocated(array)) deallocate(array) ! Allocate the array with the same length as the input string and initialize to `.false.` allocate(array(length), source=.false.) ! Loop through each character in the string concurrently. ! do concurrent (i = 1:length) do i = 1, length ! Call the `is_first_byte_of_character` function for each character and store the result in the `array`. array(i) = is_first_byte_of_character(str(i:i)) end do end subroutine !> This function counts the occurrence of a spcified character(token) in a given string. pure function count_token(str, token) result(count) implicit none character(*), intent(in) :: str ! Input string to be searched. character(1), intent(in) :: token ! Character to be counted in the input string. integer :: count ! Result: number of occurrences of the `token`. integer :: i ! Loop index variable. integer :: siz ! Length of the input string. ! Initialize the count to zero. count = 0 ! Get the length of the input string. siz = len(str) ! Loop through each character in the string. do i = 1, siz ! If the current character matches the `token`, increment the `count`. if (str(i:i) == token) count = count + 1 end do end function count_token pure function adjustl_multi_byte(chara) result(res) implicit none character(*), intent(in) :: chara character(:), allocatable :: res integer :: i res = '' i = 1 do while (i <= len(chara)) if (chara(i:i) == char(0)) then i = i + 1 cycle else exit end if end do res = chara(i:len(chara)) end function adjustl_multi_byte pure function trim_invalid_utf8_byte(chara) result(res) implicit none character(*), intent(in) :: chara character(:), allocatable :: res if (is_valid_multiple_byte_character(chara)) then res = chara else res = '' end if end function trim_invalid_utf8_byte pure function reverse_utf8(str) result(retval) use :: forgex_parameters_m implicit none character(*), intent(in) :: str character(:), allocatable :: retval integer :: i, ie ! Initialize ie = 1 i = 1 retval = '' do while (i /= INVALID_CHAR_INDEX) ie = idxutf8(str, i) retval = str(i:ie)//retval i = next_idxutf8(str, i) end do end function reverse_utf8 end module forgex_utf8_m