character_array_m.F90 Source File


Source Code

! Fortran Regular Expression (Forgex)
!
! MIT License
!
! (C) Amasaki Shinobu, 2023-2025
!     A regular expression engine for Fortran.
!     forgex_character_array_m module is a part of Forgex.
!
#ifdef IMPURE
#define pure
#endif
module forgex_character_array_m
   implicit none
   private

   type :: character_array_t
   !! This derived-type contains single UTF-8 character and two flags.
   !! It will be used to parse character class patterns enclosed in square brackets. 
   !! `is_escaped` is true when the character has preceding backslash.
   !! `is_hyphenated` is true when the character has following hyphen except for First character.
   !! `seg_size` is the number of segments the component represents, which is 1 for normal characters
   !! and greater than 1 for most shorthand escape sequences.
      character(:), allocatable :: c
      logical :: is_escaped = .false.
      logical :: is_hyphenated = .false.
      logical :: is_subtract = .false.
      integer :: seg_size = 0
   end type

   public :: character_array_t
   public :: character_string_to_array
   public :: parse_backslash_and_hyphen_in_char_array
   public :: parse_segment_width_in_char_array
   public :: parse_escape_sequence_with_argument

#ifdef IMPURE
   public :: dump_character_array_t_list
#endif

contains

   !> This subroutine parses a pattern string for character class,
   !> and outputs `character_array_t` type array.
   !> When it encounters invalid value along the way, it returns.
   pure subroutine character_string_to_array(str, array)
      use :: forgex_parameters_m, only: INVALID_CHAR_INDEX
      use :: forgex_error_m
      use :: forgex_utf8_m, only: len_utf8, idxutf8
      implicit none
      character(*), intent(in) :: str
      type(character_array_t), intent(inout), allocatable :: array(:)

      integer :: siz, ib, ie, j

      siz = len_utf8(str)
      if (siz < 1) return

      if (allocated(array)) deallocate(array)
      allocate(array(siz))

      ib = 0
      ie = 0
      do j = 1, siz
         ib = ie + 1
         ie = idxutf8(str, ib)
         if (ib == INVALID_CHAR_INDEX .or. ie == INVALID_CHAR_INDEX) return
         array(j)%c = str(ib:ie)
      end do

   end subroutine character_string_to_array
      
   !> This subroutine processes a character array, and outputs the corresponding
   !> flagged array. It removes backslash and hyphen characters, and then flags 
   !> the current element in `character_array_t` type array.
   pure subroutine parse_backslash_and_hyphen_in_char_array(array, ierr)
      use :: forgex_parameters_m
      use :: forgex_error_m
      implicit none
      type(character_array_t), intent(inout), allocatable :: array(:)
      type(character_array_t), allocatable :: temp(:)
      integer, intent(inout) :: ierr

      integer :: i, k, siz
      logical :: is_already_subtraction_zone

      if (.not. allocated(array)) return
      if (size(array, dim=1) < 1) return 

      allocate(temp(size(array, dim=1)))

      k = 1 ! actual size counter to output.
      is_already_subtraction_zone = .false.

      ! Main loop
      do i = 1, size(array, dim=1) ! i is array's index
         if (1 < i .and. i < size(array,dim=1)) then
           
            ! Handling subtract expression
            if (.not. is_already_subtraction_zone) then
               if (array(i)%c == SYMBOL_HYPN .and. array(i+1)%c == SYMBOL_HYPN) then
                  temp(k:size(temp))%is_subtract = .true.
                  is_already_subtraction_zone = .true.
                  cycle
               end if
            else
               if (array(i)%c == SYMBOL_HYPN .and. array(i+1)%c == SYMBOL_HYPN) then
                  ierr = SYNTAX_ERR_MISPLACED_SUBTRACTION_OPERATOR
                  return
               end if
            end if

            if (array(i-1)%c == SYMBOL_HYPN .and. array(i)%c == SYMBOL_HYPN) then
               cycle
            end if
         end if

         if (array(i)%c == SYMBOL_BSLH .and. .not. temp(k)%is_escaped) then
            ! If the current character is backslash
            ! except the `is_escaped` of `temp(k)` is true.
            temp(k)%is_escaped = .true.
         
         else if (array(i)%c == SYMBOL_HYPN .and. .not. i == 1) then
            ! If the current character is hyphen,
            ! except for the first character. 
            temp(k-1)%is_hyphenated = .true.
         
         else
            ! For characters has no special meaning.
            temp(k)%c = array(i)%c
            k = k + 1
         end if
      end do

      ! Copy from local array to the arguemnt array.
      siz = k - 1
      if (allocated(array)) deallocate(array)
      allocate(array(siz))
      array(:) = temp(1:siz)
      
   end subroutine parse_backslash_and_hyphen_in_char_array


      !> This subroutine assigns the expected segment size from the character `c` of
   !> the current array element to its `seg_size`.
   pure subroutine parse_segment_width_in_char_array (array)
      use :: forgex_parameters_m
      use :: forgex_utf8_m
      use :: forgex_segment_m
      implicit none
      type(character_array_t), intent(inout) :: array(:)
      type(segment_t), allocatable :: seg(:)
      integer :: k, n

      n = 0
      do k = 1, size(array, dim=1)
         if (array(k)%is_escaped) then
            select case(array(k)%c)
            case (ESCAPE_T)
               n = width_of_segment(SEG_TAB)
            case (ESCAPE_N)
               n = width_of_segment(SEG_LF) + width_of_segment(SEG_CR)
            case (ESCAPE_R)
               n = width_of_segment(SEG_CR)
            case (ESCAPE_D)
               n = width_of_segment(SEG_DIGIT)
            case (ESCAPE_D_CAPITAL)
               allocate(seg(1))
               seg(1) = SEG_DIGIT
               call invert_segment_list(seg)
               n = total_width_of_segment(seg)

            case (ESCAPE_W)
               allocate(seg(4))
               seg(1) = SEG_LOWERCASE
               seg(2) = SEG_UPPERCASE
               seg(3) = SEG_DIGIT
               seg(4) = SEG_UNDERSCORE
               n = total_width_of_segment(seg)

            case (ESCAPE_W_CAPITAL)
               allocate(seg(4))
               seg(1) = SEG_LOWERCASE
               seg(2) = SEG_UPPERCASE
               seg(3) = SEG_DIGIT
               seg(4) = SEG_UNDERSCORE
               call invert_segment_list(seg)
               n = total_width_of_segment(seg)

            case (ESCAPE_S)
               n = 6
            case (ESCAPE_S_CAPITAL)
               allocate(seg(6))
               seg(1) = SEG_SPACE
               seg(2) = SEG_TAB
               seg(3) = SEG_CR
               seg(4) = SEG_LF
               seg(5) = SEG_FF
               seg(6) = SEG_ZENKAKU_SPACE
               call invert_segment_list(seg)
               n = total_width_of_segment(seg)
            case (ESCAPE_X)
               n = 1
            case (SYMBOL_BSLH)
               n = 1
            case (SYMBOL_LCRB)
               n = 1
            case (SYMBOL_RCRB)
               n = 1
            case (SYMBOL_LSBK)
               n = 1
            case (SYMBOL_RSBK)
               n = 1
            case default
               n = INVALID_SEGMENT_SIZE
            end select
         else
            n = 1
         end if
         array(k)%seg_size = n
      end do

   end subroutine parse_segment_width_in_char_array


   pure subroutine parse_escape_sequence_with_argument(ca, ierr)
      use :: forgex_segment_m
      use :: forgex_parameters_m
      use :: forgex_utf8_m
      use :: forgex_error_m
      implicit none
      type(character_array_t), intent(inout), allocatable :: ca(:)
      integer, intent(inout) :: ierr

      type(character_array_t), allocatable :: tmp(:)
      character(2) :: hex_two_digit
      character(:), allocatable :: hex_long
      integer :: i, j, k, siz, ib, ie

      ierr = SYNTAX_VALID
      if (.not. allocated(ca)) then
         ierr = ALLOCATION_ERR
         return
      end if


      hex_two_digit = ''
      hex_long = ''

      siz = size(ca, dim=1)
      allocate(tmp(siz))

      k = 1
      j = 1
      outer: do while (j <= siz)
         if (ca(j)%c == ESCAPE_X .and. ca(j)%is_escaped) then
            tmp(k)%c = ESCAPE_X
            tmp(k)%is_escaped = .true.
            j = j + 1
            if (j > siz) exit outer
            k = k + 1

            if (j+1 <= siz) then

               if ((ichar_utf8(ca(j)%c) .in. SEG_HEX) .and. (ichar_utf8(ca(j+1)%c) .in. SEG_HEX) )then
                  hex_two_digit = trim(ca(j)%c)//trim(ca(j+1)%c)
                  tmp(k)%c = trim(adjustl(hex_two_digit))
                  ! tmp(k)%is_escaped = .true.
                  tmp(k)%is_hyphenated = ca(j+1)%is_hyphenated
                  j = j + 2
                  if (j > siz) exit outer
                  k = k + 1
                  cycle
   
               else if (ca(j)%c == SYMBOL_LCRB) then
                  i = j + 1
                  reader: do while (.true.)

                     if (i > siz) then
                        ierr = SYNTAX_ERR_CURLYBRACE_MISSING
                        return
                     end if

                     if (ca(i)%c /= SYMBOL_RCRB .and. .not. (ichar_utf8(ca(i)%c) .in. SEG_HEX)) then
                        ierr = SYNTAX_ERR_INVALID_HEXADECIMAL
                        return
                     else if (ca(i)%c == SYMBOL_RCRB) then
                        exit reader
                     end if
                     hex_long = trim(adjustl(hex_long))//ca(i)%c
                     i = i + 1
                  end do reader

                  tmp(k)%c = trim(adjustl(hex_long))
                  ! tmp(k)%is_escaped = .true.
                  tmp(k)%is_hyphenated = ca(i)%is_hyphenated

                  j = i + 1
                  if (j > siz) exit outer
                  k = k + 1
                  
                  hex_long = ''
                  cycle
               else
                  ierr = SYNTAX_ERR_INVALID_HEXADECIMAL
                  return
               end if

            else
               ierr = SYNTAX_ERR_HEX_DIGITS_NOT_ENOUGH
               return
            end if

         else if (ca(j)%c == ESCAPE_P) then
            ierr = SYNTAX_ERR_UNICODE_PROPERTY_NOT_IMPLEMENTED
            return
         end if

         tmp(k) = ca(j)
         
         j = j + 1
         if (j > siz) exit
         k = k + 1
      end do outer

      deallocate(ca)
      allocate(ca(k))
      ca(:) = tmp(1:k)

      ! call dump_character_array_t_list(tmp)
      ! call dump_character_array_t_list(ca)
      
   end subroutine parse_escape_sequence_with_argument


   pure function index_ca(ca, chara) result(idx)
      use :: forgex_parameters_m
      implicit none
      type(character_array_t), intent(in) :: ca(:)
      character(*), intent(in) :: chara
      integer :: idx
      integer :: i, siz
      idx = 0
      siz = size(ca, dim=1)
      do i = 1, siz
         if (ca(i)%c == chara) then
            idx = i
            return
         end if
      end do
   end function index_ca

!=====================================================================!

#ifdef IMPURE
   subroutine dump_character_array_t_list(list)
      use :: iso_fortran_env, only: stderr => error_unit
      implicit none
      type(character_array_t), intent(in) :: list(:)
      
      character(:), allocatable :: fmt
      integer :: i

      fmt = "('|', a6, 1x, '|', l8, 1x, '|', l11 ,1x, '|', l9, 1x, '|', i7, 1x, '|')"
      write(stderr, '(a)') '+=========== character_array_t output ==============' 
      write(stderr, '(a)') '| chara | escaped | hyphenated | subtract |  size  |' 
      do i = 1, size(list, dim=1)
         write(stderr, fmt) list(i)%c, list(i)%is_escaped, list(i)%is_hyphenated, &
            list(i)%is_subtract, list(i)%seg_size
      end do
      write(stderr, '(a)') '+=======+=========+============+==========+========+' 

   end subroutine dump_character_array_t_list
#endif

end module forgex_character_array_m