utf8_m.f90 Source File

This file contains procedures to handle UTF-8 character set.



Source Code

! Fortran Regular Expression (Forgex)
!
! MIT License
!
! (C) Amasaki Shinobu, 2023-2024
!     A regular expression engine for Fortran.
!     forgex_utf8_m module is a part of Forgex.

!! This file contains procedures to handle UTF-8 character set.

!> The `forgex_utf8_m` module processes a byte-indexed character strings type as UTF-8 strings.
module forgex_utf8_m
   implicit none
   private

   public :: idxutf8
   public :: char_utf8, ichar_utf8
   public :: count_token
   public :: is_first_byte_of_character
   public :: is_first_byte_of_character_array
   public :: len_trim_utf8, len_utf8
   public :: is_valid_multiple_byte_character
   public :: adjustl_multi_byte
   public :: trim_invalid_utf8_byte

contains

   ! INDEX OF UTF8
   !> This function returns the index of the end of the (multibyte) character,
   !> given the string str and the current index curr.
   pure function idxutf8 (str, curr) result(tail)
      use, intrinsic :: iso_fortran_env
      use :: forgex_parameters_m
      implicit none
      character(*),   intent(in) :: str      ! Input string, a multibyte character is expected.
      integer(int32), intent(in) :: curr     ! Current index.
      integer(int32)             :: tail     ! Resulting index of the end of the character.
      integer(int32)             :: i        ! Loop variable.
      integer(int8)              :: byte     ! Variable to hold the byte value of the 1-byte part of the character
      integer(int8) :: shift_3, shift_4, shift_5, shift_6, shift_7
         ! Shifted byte values.


      tail = curr    ! Initialize tail to the current index.

      do i = 0, 3    ! Loop over the next four bytes to determine the byte-length of the character.

         byte = int(ichar(str(curr+i:curr+i)), kind(byte))
            ! Get the byte value of the character at position `curr+1`.

         shift_3 = ishft(byte, -3)  ! Right shift the byte by 3 bits
         shift_4 = ishft(byte, -4)  ! Right shift the byte by 3 bits
         shift_5 = ishft(byte, -5)  ! Right shift the byte by 5 bits
         shift_6 = ishft(byte, -6)  ! Right shift the byte by 6 bits
         shift_7 = ishft(byte, -7)  ! Right shift the byte by 7 bits

         if (shift_6 == 2) cycle    ! Continue to the next iteration if the `byte` is a continuation byte (10xxxxxx_2).

         if (i == 0) then   ! Check the first byte to determine the character length.

            if (shift_3 == 30 ) then ! If the byte starts with 11110_2 (4-byte character).
               tail = curr + 4 - 1
               return
            end if

            if (shift_4 == 14) then ! If the byte starts witth 1110_2 (3-byte character).
               tail = curr + 3 - 1
               return
            end if

            if (shift_5 == 6) then  ! If the byte starts with 110_2 (2-byte character).
               tail = curr + 2 - 1
               return
            end if

            if (shift_7 == 0) then ! If then byte starts with 0_2 (1-byte character).
               tail = curr + 1 - 1
               return
            end if

         else     ! Check continuation byptes

            if (shift_3 == 30 .or. shift_4 == 14 .or. shift_5 == 6 .or. shift_7 == 0) then
               tail = curr + i - 1
               return
            end if

         end if
      end do

   end function idxutf8


   pure function is_valid_multiple_byte_character(chara) result(res)
      use, intrinsic :: iso_fortran_env, only: int32, int8
      implicit none
      character(*), intent(in) :: chara
      logical :: res

      integer :: siz, i, expected_siz
      integer(int8) :: shift_3, shift_4, shift_5, shift_6, shift_7
      integer(int8) :: byte
      
      res = .true.
      siz = len(chara)

      byte = ichar(chara(1:1), kind=int8)
      shift_3 = ishft(byte, -3)  ! Right shift the byte by 3 bits
      shift_4 = ishft(byte, -4)  ! Right shift the byte by 4 bits
      shift_5 = ishft(byte, -5)  ! Right shift the byte by 5 bits
      shift_6 = ishft(byte, -6)  ! Right shift the byte by 6 bits
      shift_7 = ishft(byte, -7)  ! Right shift the byte by 7 bits

      ! 1st byte
      if (shift_3 == 30) then
         expected_siz = 4
      else if (shift_4 == 14)then
         expected_siz = 3 
      else if (shift_5 == 6) then
         expected_siz = 2
      else if (shift_7 == 0) then ! for 1-byte character
         expected_siz = 1         
      else
         res = .false.
         return
      end if

      if (expected_siz /= siz) then
         res = .false.
         return
      end if

      do i = 2, expected_siz
         byte = ichar(chara(i:i), kind=int8)
         shift_6 = ishft(byte, -6)  ! Right shift the byte by 6 bits
         if (shift_6 /= 2) then
            res = .false.
            return
         end if
      end do

   end function is_valid_multiple_byte_character
      

   !> The `char_utf8` function takes a code point as integer in Unicode character set,
   !> and returns the corresponding character as UTF-8 binary string.
   !>
   !> This function is like an extension of char() for the UTF-8 codeset.
   pure function char_utf8 (code) result(str)
      use, intrinsic :: iso_fortran_env
      implicit none
      integer(int32), intent(in) :: code        ! Input Unicode code point.

      character(:), allocatable  :: str         ! Resulting one UTF-8 character.
      character(32), allocatable :: bin         ! A 32-digit number expressed in character format for masking.
      integer(int32)             :: buf, mask   ! Buffer and mask for bit operations.
      integer(int8)              :: byte(4)     ! Array to hold up 4 bytes of the UTF-8 character.

      str = ''    ! Initialize result string.
      buf = code  ! Initialize buffer with input `code` point.

      bin = '0000000000000000000000000111111' ! Lower 6-bit mask
      read(bin, '(b32.32)') mask              ! Read the `mask` from the `bin` character string.

      byte(1) = int(iand(ishft(buf, -18), mask),kind(byte))    ! First byte

      buf = code
      byte(2) = int(iand(ishft(buf, -12), mask), kind(byte))   ! Second byte

      buf = code
      byte(3) = int(iand(ishft(buf, -6), mask), kind(byte))    ! Third byte

      buf = code
      byte(4) = int(iand(buf, mask), kind(byte))               ! Fourth byte

      if (code > 2**7-1) then    ! Check if the `code` point is greater than 127 (non-ASCII character).

         if (2**16 -1 < code) then     ! 4-byte character
            byte(1) = ibset(byte(1),7)
            byte(1) = ibset(byte(1),6)
            byte(1) = ibset(byte(1),5)
            byte(1) = ibset(byte(1),4)
            byte(1) = ibclr(byte(1),3)
            byte(2) = set_continuation_byte(byte(2))  ! Set continuation bytes.
            byte(3) = set_continuation_byte(byte(3))
            byte(4) = set_continuation_byte(byte(4))

         else if (2**11 - 1 < code) then  ! 3-byte character
            byte(1) = 32
            byte(2) = ibset(byte(2), 7)
            byte(2) = ibset(byte(2), 6)
            byte(2) = ibset(byte(2), 5)
            byte(2) = ibclr(byte(2), 4)
            byte(3) = set_continuation_byte(byte(3))
            byte(4) = set_continuation_byte(byte(4))

         else if (2**7 -1 < code) then    ! 2-byte character
            byte(1) = 32
            byte(2) = 32
            byte(3) = ibset(byte(3), 7)
            byte(3) = ibset(byte(3), 6)
            byte(3) = ibclr(byte(3), 5)
            byte(4) = set_continuation_byte(byte(4))
         end if

         str = char(byte(1)) //char(byte(2)) //char(byte(3)) //char(byte(4))  ! Concatenate bytes into a string.
         str = trim(adjustl(str))   ! Trim leading and tailing space.

      else
         str = char(code)  ! For ASCII characters.
      end if

   end function char_utf8


   !> This function take one byte, set the first two bits to 10, and
   !> returns one byte of the continuation part.
   pure function set_continuation_byte(byte) result(res)
      use, intrinsic :: iso_fortran_env, only: int8
      implicit none
      integer(int8), intent(in) :: byte
      integer(int8) :: res

      res = ibset(byte, 7) ! 1xxxxxxx
      res = ibclr(res, 6)  ! 10xxxxxx
   end function set_continuation_byte


   !> Take a UTF-8 character as an argument and
   !> return the integer (also known as "code point" in Unicode) representing
   !> its UTF-8 binary string.
   !>
   !> This function is like an extension of char() for the UTF-8 codeset.
   pure function ichar_utf8 (chara) result(res)
      use, intrinsic :: iso_fortran_env
      implicit none
      character(*), intent(in) :: chara   ! Input one UTF-8 character

      integer(int32) :: res         ! Resulting integer representing an UTF-8 binary string.
      integer(int8)  :: byte(4)     ! Byte array (32bit)
      integer(int8)  :: shift_3, shift_4, shift_5, shift_7              ! Shift values
      integer(int8)  :: mask_2_bit, mask_3_bit, mask_4_bit, mask_5_bit  ! Masks for bit operations
      integer(int32) :: buf         ! Buffer for bit operations

      character(8) :: binary        ! 8-byte character string representing binary.

      binary = '00111111'           ! 6-bit mask for continuation bytes.
      read(binary, '(b8.8)') mask_2_bit

      binary = '00011111'           ! 5-bit mask for 2-byte characters.
      read(binary, '(b8.8)') mask_3_bit

      binary = '00001111'           ! 4-bit mask for 3-byte characters.
      read(binary, '(b8.8)') mask_4_bit

      binary = '00000111'           ! 3-bit mask for 4-byte characters.
      read(binary, '(b8.8)') mask_5_bit

      res = 0     ! Initialize result

      if (len(chara) > 4)  then     ! Check if the length of input character is more than 4 bytes.
         res = -1                   ! Invalid UTF-8 character.
         return
      end if

      ! Convert a multi-byte character to thier integer byte representation.
      byte(1) = int(ichar(chara(1:1)),kind(byte))
      if (len(chara) >= 2) byte(2) = int(ichar(chara(2:2)), kind(byte))
      if (len(chara) >= 3) byte(3) = int(ichar(chara(3:3)), kind(byte))
      if (len(chara) >= 4) byte(4) = int(ichar(chara(4:4)), kind(byte))

      ! Perform bit shifts to determine character's byte-length.
      shift_3 = ishft(byte(1), -3)
      shift_4 = ishft(byte(1), -4)
      shift_5 = ishft(byte(1), -5)
      shift_7 = ishft(byte(1), -7)

      ! 1-byte character
      if (shift_7 == 0) then

         res = byte(1)
         return

      ! 4-byte character
      else if (shift_3 == 30) then

         ! First 1 byte
         res =  iand(byte(1), mask_5_bit)

         ! Continuation bytes
         res = ishft(res, 6)     ! Left shift by 6 bits and store into res
         buf =  iand(byte(2), mask_2_bit) ! Mask `byte(2)` with `mask_2_bit` and store the result into `buf`.
         res =   ior(res, buf)   ! Take the bitwise OR of `res` and `buf`. The same applies below.

         res = ishft(res, 6)
         buf =  iand(byte(3), mask_2_bit)
         res =   ior(res, buf)

         res = ishft(res, 6)
         buf =  iand(byte(4), mask_2_bit)
         res =   ior(res, buf)

      ! 3-byte character
      else if (shift_4 == 14) then

         res =  iand(byte(1), mask_4_bit)

         res = ishft(res, 6)
         buf =  iand(byte(2), mask_2_bit)
         res =   ior(res, buf)

         res = ishft(res, 6)
         buf =  iand(byte(3), mask_2_bit)
         res =   ior(res, buf)

      ! 2-byte character
      else if (shift_5 == 6) then

         res =  iand(byte(1), mask_3_bit)

         res = ishft(res, 6)
         buf =  iand(byte(2), mask_2_bit)
         res =   ior(res, buf)

      end if
   end function ichar_utf8


   !> This function calculates the length of a UTF-8 string excluding tailing spaces.
   !>
   !> It takes a UTF-8 string as input and returns the number of characters in the string,
   !> ignoring any tailing whitespace characters.
   pure function len_trim_utf8(str) result(count)
      implicit none
      character(*), intent(in) :: str
      integer :: i, inext, count

      ! Initialize
      i = 1
      count = 0

      ! Loop through the string until the end of the trimed string is reached.
      do while(i <= len_trim(str))
         inext = idxutf8(str, i) + 1   ! Get the index of the next UTF-8 character.
         count = count + 1             ! Increment the character count.
         i = inext                     ! Move to the next character.
      end do

   end function len_trim_utf8


   !> This function calculates the length of a UTF-8 string.
   !>
   !> It takes a UTF-8 string as input and returns the number of characters in the string.
   pure function len_utf8(str) result(count)
      implicit none
      character(*), intent(in) :: str
      integer :: i, inext, count

      ! Initialize
      i = 1
      count = 0

      ! Loop through the string until the end of the string is reached.
      do while(i <= len(str))
         inext = idxutf8(str, i) + 1   ! Get the index of the next UTF-8 character.
         count = count + 1             ! Increment the character count.
         i = inext                     ! Move to the next character.
      end do
   end function len_utf8


   !> This function determines if a given character is the first byte of
   !> a UTF-8 multibyte character. It takes a 1-byte character as input
   !> and returns a logical value indicating if it is the first byte of
   !> an UTF-8 binary string.
   pure function is_first_byte_of_character(chara) result(res)
      use, intrinsic :: iso_fortran_env
      implicit none
      character(1), intent(in) :: chara   ! Input single byte character

      logical       :: res             ! Result indicating if it is the first byte of a multibyte character.
      integer(int8) :: byte, shift_6   ! Integer representation of the character and shifted value.

      ! Convert the character to its integer representation
      byte = int(ichar(chara), kind(byte))

      ! Initialize the result to `.true.` (assume it is the first byte).
      res = .true.

      ! Shift the byte 6 bits to the right.
      shift_6 = ishft(byte, -6)

      ! If the shifted value equals 2 (10_2), it is a continuation byte, not the first byte.
      if (shift_6 == 2) res = .false.

   end function is_first_byte_of_character


   !> This subroutine determines if each character in a given string is the first byte of a UTF-8 multibyte character.
   !> It takes a UTF-8 string and return a logical array indicating for each position if it is the first byte.
   pure subroutine is_first_byte_of_character_array (str, array, length)
      use, intrinsic :: iso_fortran_env, only: int32
      implicit none
      logical, allocatable,  intent(inout) :: array(:)   ! Output logical array indicating first byte status.
      integer(int32),        intent(in)    :: length     ! Length of the input string
      character(len=length), intent(in)    :: str        ! Input UTF-8 string

      integer :: i      ! Loop index variable

      ! Deallocate the array if it is already allocated.
      if (allocated(array)) deallocate(array)

      ! Allocate the array with the same length as the input string and initialize to `.false.`
      allocate(array(length), source=.false.)

      ! Loop through each character in the string concurrently.
      ! do concurrent (i = 1:length)
      do i = 1, length
         ! Call the `is_first_byte_of_character` function for each character and store the result in the `array`.
         array(i) = is_first_byte_of_character(str(i:i))
      end do

   end subroutine

   !> This function counts the occurrence of a spcified character(token) in a given string.
   pure function count_token(str, token) result(count)
      implicit none
      character(*), intent(in) :: str     ! Input string to be searched.
      character(1), intent(in) :: token   ! Character to be counted in the input string.

      integer :: count     ! Result: number of occurrences of the `token`.
      integer :: i         ! Loop index variable.
      integer :: siz       ! Length of the input string.

      ! Initialize the count to zero.
      count = 0

      ! Get the length of the input string.
      siz = len(str)

      ! Loop through each character in the string.
      do i = 1, siz
         ! If the current character matches the `token`, increment the `count`.
         if (str(i:i) == token) count = count + 1
      end do
   end function count_token


   pure function adjustl_multi_byte(chara) result(res)
      implicit none
      character(*), intent(in) :: chara
      character(:), allocatable :: res

      integer :: i
      res = ''
      i = 1
      do while (i <= len(chara))
         if (chara(i:i) == char(0)) then
            i = i + 1
            cycle
         else
            exit
         end if
      end do

      res = chara(i:len(chara))

   end function adjustl_multi_byte

   pure function trim_invalid_utf8_byte(chara) result(res)
      implicit none
      character(*), intent(in) :: chara
      character(:), allocatable :: res
      
      if (is_valid_multiple_byte_character(chara)) then
         res = chara
      else
         res = ''
      end if
   
   end function trim_invalid_utf8_byte

end module forgex_utf8_m