is_valid_multiple_byte_character Function

public pure function is_valid_multiple_byte_character(chara) result(res)

This function checks whether the input byte string is valid as a single UTF-8 character.

Arguments

Type Intent Optional Attributes Name
character(len=*), intent(in) :: chara

Return Value logical


Source Code

   pure function is_valid_multiple_byte_character(chara) result(res)
      ! Check whether `chara` has the byte structure of exactly one UTF-8
      ! encoded character.
      !
      ! Arguments:
      !   chara : byte string to examine; its full length is significant
      !           (trailing blanks count as bytes).
      !
      ! Result:
      !   res   : .true. when the first byte is a 1-, 2-, 3- or 4-byte UTF-8
      !           lead byte, the length of `chara` equals the length announced
      !           by that lead byte, and every following byte has the
      !           continuation form 10xxxxxx. Otherwise .false., including
      !           for a zero-length string.
      !
      ! Only the bit-pattern structure is checked. Overlong encodings,
      ! UTF-16 surrogate code points and code points above U+10FFFF are not
      ! rejected here.
      implicit none
      character(*), intent(in) :: chara
      logical :: res

      ! Bit patterns obtained after right-shifting a byte so that only its
      ! identifying prefix remains.
      integer, parameter :: byte_mask     = 255  ! keep the low 8 bits
      integer, parameter :: prefix_2_byte = 6    ! 110_2   (byte >> 5)
      integer, parameter :: prefix_3_byte = 14   ! 1110_2  (byte >> 4)
      integer, parameter :: prefix_4_byte = 30   ! 11110_2 (byte >> 3)
      integer, parameter :: prefix_cont   = 2    ! 10_2    (byte >> 6)

      integer :: siz, i, expected_siz
      integer :: byte

      ! Pessimistic default: every early return below means "not valid".
      res = .false.
      siz = len(chara)

      ! An empty string holds no character; it must be rejected before
      ! chara(1:1) is referenced, which would be out of bounds.
      if (siz == 0) return

      ! Work with the byte as a non-negative default integer in 0..255.
      ! Masking makes the value independent of whether the processor
      ! reports bytes above 127 as positive or negative.
      byte = iand(ichar(chara(1:1)), byte_mask)

      ! 1st byte: determine how many bytes the character announces.
      if (ishft(byte, -7) == 0) then                   ! 0xxxxxxx
         expected_siz = 1
      else if (ishft(byte, -5) == prefix_2_byte) then  ! 110xxxxx
         expected_siz = 2
      else if (ishft(byte, -4) == prefix_3_byte) then  ! 1110xxxx
         expected_siz = 3
      else if (ishft(byte, -3) == prefix_4_byte) then  ! 11110xxx
         expected_siz = 4
      else
         ! 10xxxxxx (stray continuation byte) or 11111xxx (5+ byte form).
         return
      end if

      ! The string must contain exactly the announced number of bytes.
      if (expected_siz /= siz) return

      ! Every remaining byte must be a continuation byte 10xxxxxx.
      do i = 2, expected_siz
         byte = iand(ichar(chara(i:i)), byte_mask)
         if (ishft(byte, -6) /= prefix_cont) return
      end do

      res = .true.

   end function is_valid_multiple_byte_character