idxutf8 Function

public pure function idxutf8(str, curr) result(tail)

This function returns the index of the end of the (multibyte) character, given the string str and the current index curr. Classes of invalid UTF-8 characters: (1) invalid lead byte; (2) invalid trail byte; (3) overrun; (4) overlong encoding; (5) incomplete multibyte sequence; (6) invalid character range (U+D800-U+DFFF); (7) BOM appearing in the middle; (8) isolated trail byte.

In any of the above cases, idxutf8 returns curr. You should then call is_valid_multiple_byte_character at a higher level to validate the substring.

Arguments

Type IntentOptional Attributes Name
character(len=*), intent(in) :: str
integer(kind=int32), intent(in) :: curr

Return Value integer(kind=int32)


Source Code

   pure function idxutf8 (str, curr) result(tail)
      !! Return the index of the last byte of the UTF-8 character that starts at
      !! position `curr` of `str`.
      !!
      !! Return values:
      !!
      !! - `INVALID_CHAR_INDEX` when `curr` lies outside `1..len(str)`.
      !! - `curr + n - 1` when the byte at `curr` is the lead byte of an `n`-byte
      !!   sequence (n = 1..4), the sequence fits inside `str`, and
      !!   `is_valid_multiple_byte_character` accepts `str(curr:tail)`.
      !! - `curr` in every other case, i.e. when the bytes at `curr` do not form a
      !!   sequence accepted by `is_valid_multiple_byte_character`.
      !!
      !! Classes of invalid UTF-8 characters:
      !!
      !! 1. invalid lead byte
      !! 2. invalid trail byte
      !! 3. overrun
      !! 4. overlong encoding
      !! 5. incomplete multibyte sequence
      !! 6. invalid character range (U+D800-U+DFFF)
      !! 7. BOM appears in the middle
      !! 8. isolated trail byte
      !!
      !! In these cases `idxutf8` returns `curr`. The caller should then call
      !! `is_valid_multiple_byte_character` at a higher level to validate the substring.
      use, intrinsic :: iso_fortran_env, only: int32
      use :: forgex_parameters_m
      implicit none
      character(*),   intent(in) :: str      ! Input string, a multibyte character is expected.
      integer(int32), intent(in) :: curr     ! Current index (1-based byte position in `str`).
      integer(int32)             :: tail     ! Resulting index of the end of the character.

      ! Values of the high-order bits that identify each kind of UTF-8 byte.
      integer(int32), parameter :: LEAD_4BYTE   = 30   ! 11110_2, compared with byte >> 3
      integer(int32), parameter :: LEAD_3BYTE   = 14   ! 1110_2,  compared with byte >> 4
      integer(int32), parameter :: LEAD_2BYTE   = 6    ! 110_2,   compared with byte >> 5
      integer(int32), parameter :: CONTINUATION = 2    ! 10_2,    compared with byte >> 6
      integer(int32), parameter :: ASCII        = 0    ! 0_2,     compared with byte >> 7

      integer(int32) :: i           ! Offset from `curr` of the byte being inspected.
      integer(int32) :: str_len     ! Length of `str` in bytes.
      integer(int32) :: byte        ! Value of the inspected byte, always in 0..255.
      integer(int32) :: shift_3, shift_4, shift_5, shift_6, shift_7
         ! `byte` shifted right by 3, 4, 5, 6 and 7 bits.

      str_len = len(str, kind=int32)

      ! If the index lies outside str, return the invalid value.
      ! The lower bound is checked too: str(curr:curr) with curr < 1 would be out of bounds.
      if (curr < 1 .or. curr > str_len) then
         tail = INVALID_CHAR_INDEX
         return
      end if

      tail = curr    ! Initialize tail to the current index.

      outer: do i = 0, 3    ! Inspect at most four bytes, the maximum length of a UTF-8 character.

         ! The string ended in the middle of the scan (incomplete multibyte character).
         if (curr+i > str_len) then
            tail = curr
            return
         end if

         ! Get the byte value of the character at position `curr+i`.
         ! The value is held in a 32-bit integer and masked to 0..255: non-ASCII bytes
         ! (128..255) are not representable in int8, so converting them to int8 would
         ! give a processor-dependent result.
         byte = int(iand(ichar(str(curr+i:curr+i)), 255), kind=int32)

         shift_3 = ishft(byte, -3)  ! Right shift the byte by 3 bits
         shift_4 = ishft(byte, -4)  ! Right shift the byte by 4 bits
         shift_5 = ishft(byte, -5)  ! Right shift the byte by 5 bits
         shift_6 = ishft(byte, -6)  ! Right shift the byte by 6 bits
         shift_7 = ishft(byte, -7)  ! Right shift the byte by 7 bits

         ! Continue to the next iteration if the byte is a continuation byte (10xxxxxx_2).
         if (shift_6 == CONTINUATION) cycle outer

         if (i == 0) then   ! Check the first byte to determine the character length.

            if (shift_3 == LEAD_4BYTE) then        ! 11110xxx_2: 4-byte character.
               tail = curr + 4 - 1
            else if (shift_4 == LEAD_3BYTE) then   ! 1110xxxx_2: 3-byte character.
               tail = curr + 3 - 1
            else if (shift_5 == LEAD_2BYTE) then   ! 110xxxxx_2: 2-byte character.
               tail = curr + 2 - 1
            else if (shift_7 == ASCII) then        ! 0xxxxxxx_2: 1-byte character.
               tail = curr + 1 - 1
            else
               ! Not a recognized lead byte (11111xxx_2): keep scanning the following bytes.
               cycle outer
            end if
            exit outer

         else     ! The byte at `curr` was not a recognized lead byte.

            ! A recognized lead byte (or ASCII byte) at offset i starts the next character,
            ! so the run beginning at `curr` ends on the byte just before it.
            if (shift_3 == LEAD_4BYTE .or. shift_4 == LEAD_3BYTE .or. &
                shift_5 == LEAD_2BYTE .or. shift_7 == ASCII) then
               tail = curr + i - 1
               exit outer
            end if

         end if
      end do outer

      ! Accept the candidate only if it fits inside str and is a valid UTF-8 sequence.
      ! The two tests stay nested because Fortran does not guarantee short-circuit
      ! evaluation, and str(curr:tail) must not be referenced when tail > len(str).
      if (tail <= str_len) then
         if (.not. is_valid_multiple_byte_character(str(curr:tail))) tail = curr
      else
         tail = curr
      end if

   end function idxutf8