idxutf8 Function

public pure function idxutf8(str, curr) result(tail)

Given the string `str` and the current byte index `curr`, this function returns the index of the last byte of the (possibly multibyte) UTF-8 character located at `curr`.

Arguments

Type Intent Optional Attributes Name
character(len=*), intent(in) :: str
integer(kind=int32), intent(in) :: curr

Return Value integer(kind=int32)


Source Code

   !> Return the index of the last byte of the UTF-8 character found at byte
   !> position `curr` of `str`.
   !>
   !> * If the byte at `curr` is a lead byte (or an ASCII byte), the result is
   !>   `curr + n - 1`, where `n` (1 to 4) is the length announced by that byte.
   !>   The announced length is not checked against `len(str)`, so a truncated
   !>   trailing sequence yields an index beyond the end of the string.
   !> * If the byte at `curr` is a continuation byte (10xxxxxx) or not a valid
   !>   UTF-8 lead byte (0xF8..0xFF), up to three following bytes are scanned;
   !>   the result is the index just before the first ASCII/lead byte found.
   !> * If no such byte is found, or `curr` lies outside `1..len(str)`, the
   !>   result is `curr`.
   !>
   !> The string is never read outside its bounds.
   pure function idxutf8 (str, curr) result(tail)
      use, intrinsic :: iso_fortran_env, only: int32
      implicit none
      character(*),   intent(in) :: str      ! Input string; UTF-8 encoded bytes are expected.
      integer(int32), intent(in) :: curr     ! Current (byte) index into `str`.
      integer(int32)             :: tail     ! Resulting index of the end of the character.
      integer(int32)             :: i        ! Offset from `curr` of the byte being examined.
      integer(int32)             :: pos      ! Absolute index of the byte being examined.
      integer(int32)             :: nbytes   ! Character length announced by a lead byte.
      integer                    :: code     ! Byte value, kept as a default integer so that
                                             ! values 128..255 need no narrowing conversion.

      tail = curr    ! Fallback result when no ASCII/lead byte is found.

      ! A position outside the string cannot be examined at all.
      if (curr < 1 .or. curr > len(str)) return

      scan: do i = 0, 3    ! A UTF-8 character occupies at most four bytes.

         ! Stop before reading past the end of the string
         ! (written as a subtraction so that `curr + i` cannot overflow).
         if (i > len(str) - curr) exit scan

         pos  = curr + i
         code = ichar(str(pos:pos))   ! Byte value at position `curr + i`.

         select case (code)
         case (0:127)       ! 0xxxxxxx_2: 1-byte (ASCII) character.
            nbytes = 1
         case (128:191)     ! 10xxxxxx_2: continuation byte, keep scanning.
            cycle scan
         case (192:223)     ! 110xxxxx_2: lead byte of a 2-byte character.
            nbytes = 2
         case (224:239)     ! 1110xxxx_2: lead byte of a 3-byte character.
            nbytes = 3
         case (240:247)     ! 11110xxx_2: lead byte of a 4-byte character.
            nbytes = 4
         case default       ! 11111xxx_2: not valid in UTF-8, keep scanning.
            cycle scan
         end select

         if (i == 0) then
            ! `curr` is on the first byte: its bit pattern gives the length.
            tail = curr + nbytes - 1
         else
            ! A new character starts at `curr + i`, so the character that
            ! `curr` belongs to ends on the byte just before it.
            tail = curr + i - 1
         end if
         return

      end do scan

   end function idxutf8