segment_m.F90 Source File

This file defines segment_t representing subset of UTF-8 character codeset and contains procedures for that.



Source Code

! Fortran Regular Expression (Forgex)
!
! MIT License
!
! (C) Amasaki Shinobu, 2023-2025
!     A regular expression engine for Fortran.
!     forgex_segment_m module is a part of Forgex.
!
!! This file defines `segment_t` representing subset of UTF-8 character codeset
!! and contains procedures for that.
#ifdef IMPURE
#define pure
#endif
module forgex_segment_m
   use, intrinsic :: iso_fortran_env, only: int32
   use :: forgex_parameters_m, only: UTF8_CODE_MIN, UTF8_CODE_MAX, UTF8_CODE_EMPTY
   implicit none
   private

   public :: operator(==)
   public :: operator(/=)
   public :: operator(.in.)
   public :: invert_segment_list
   public :: which_segment_symbol_belong
   public :: symbol_to_segment

   public :: sort_segment_by_min
   public :: merge_segments
   public :: segment_is_valid
   public :: register_segment_to_list
   public :: join_two_segments
   public :: width_of_segment
   public :: total_width_of_segment
   public :: hex2seg
   public :: prop2seg



   !> This derived-type represents a contiguous range of the Unicode character set
   !> as a `min` and `max` value, providing an effective way to represent ranges of characters
   !> when building automata where a range characters share the same transition destination.
   type, public :: segment_t
      integer(int32) :: min = UTF8_CODE_MAX+2 ! = 2097153
      integer(int32) :: max = UTF8_CODE_MAX+2 ! = 2097153
   contains
      procedure :: print => segment_for_print
      procedure :: validate => segment_is_valid
   end type

   ! See ASCII code set
   type(segment_t), parameter, public :: SEG_INIT  = segment_t(UTF8_CODE_MAX+2, UTF8_CODE_MAX+2)
   type(segment_t), parameter, public :: SEG_ERROR = segment_t(-2, -2)
   type(segment_t), parameter, public :: SEG_EPSILON = segment_t(UTF8_CODE_MAX+3, UTF8_CODE_MAX+3)
   type(segment_t), parameter, public :: SEG_EMPTY = segment_t(UTF8_CODE_EMPTY, UTF8_CODE_EMPTY)
   type(segment_t), parameter, public :: SEG_NULL  = segment_t(0,0)
   type(segment_t), parameter, public :: SEG_ANY   = segment_t(UTF8_CODE_MIN, UTF8_CODE_MAX)
   type(segment_t), parameter, public :: SEG_TAB   = segment_t(9, 9)     ! Horizontal Tab
   type(segment_t), parameter, public :: SEG_LF    = segment_t(10, 10)   ! Line Feed
   type(segment_t), parameter, public :: SEG_FF    = segment_t(12, 12)   ! Form Feed
   type(segment_t), parameter, public :: SEG_CR    = segment_t(13, 13)   ! Carriage Return
   type(segment_t), parameter, public :: SEG_SPACE = segment_t(32, 32)   ! White space
   type(segment_t), parameter, public :: SEG_UNDERSCORE = segment_t(95, 95)
   type(segment_t), parameter, public :: SEG_DIGIT = segment_t(48, 57)   ! 0-9
   type(segment_t), parameter, public :: SEG_UPPERCASE = segment_t(65, 90)   ! A-Z
   type(segment_t), parameter, public :: SEG_LOWERCASE = segment_t(97, 122)  ! a-z
   type(segment_t), parameter, public :: SEG_ZENKAKU_SPACE = segment_t(12288, 12288) ! ' ' U+3000 全角スペース
   type(segment_t), parameter, public :: SEG_UPPER = segment_t(UTF8_CODE_MAX+1, UTF8_CODE_MAX+1)
   type(segment_t), parameter, public :: SEG_WHOLE = segment_t(0, UTF8_CODE_MAX)
   type(segment_t), parameter, public, dimension(3) :: SEG_HEX = [SEG_DIGIT, segment_t(65,70), segment_t(97,102)] 

   interface operator(==)
      !! This interface block provides a equal operator for comparing segments.
      module procedure :: segment_equivalent
   end interface

   interface operator(/=)
      !! This interface block provides a not equal operator for comparing segments.
      module procedure :: segment_not_equiv
   end interface

   interface operator(.in.)
      !! This interface block provides the `.in.` operator, which checks whether
      !! an integer and a segment, an integer and a list of segments, or a segment
      !! and a segment, is contained in the latter, respectively.
      module procedure :: arg_in_segment
      module procedure :: arg_in_segment_list
      module procedure :: seg_in_segment
      module procedure :: seg_in_segment_list
      !! @note Note that this is unrelated to the `.in.` operator provided by `forgex` module,
      !! which is intended to be used only by backend modules that implement Forgex (i.e. only
      !! if the `use forgex_segment_m` statement is declared in some module).
   end interface

   !! @note Support for handling many Unicode whitespace characters is currently not
   !! available, but will be added in the future.

   !! @note We would like to add a procedure to merge adjacent segments with the same transition
   !! destination into a single segment.

contains

   !| Checks if the given integer is within the specified segment.
   !
   !  This function determines whether the integer `a` falls within the
   !  range defined by the `min` and `max` values of the `segment_t` type.
   pure elemental function arg_in_segment(a, seg) result(res)
      implicit none
      integer(int32),  intent(in) :: a
      type(segment_t), intent(in) :: seg
      logical :: res

      res = seg%min <= a .and. a <= seg%max
   end function arg_in_segment

   !| Check if the ginve integer is within any of specified segments in a list.
   !
   !  This function determins whether the integer `a` falls within any of the
   !  ranges defined by the `min` and `max` value of the `segment_t` type
   !  in the provided list of segments.
   pure function arg_in_segment_list(a, seg_list) result(res)
      implicit none
      integer(int32),  intent(in) :: a
      type(segment_t), intent(in) :: seg_list(:)
      logical :: res
      integer :: i

      ! Initialize
      res = .false.

      ! Scan the list of segments
      do i = 1, ubound(seg_list, dim=1)
         res = res .or. (seg_list(i)%min <= a .and. a <= seg_list(i)%max)
      end do
   end function arg_in_segment_list


   !| Check if the one segment is completely within another segment.
   !
   !  This function determines whether the segment `a` is entirely within the
   !  range specified by the segment `b`.
   pure elemental function seg_in_segment(a, b) result(res)
      implicit none
      type(segment_t), intent(in) :: a, b
      logical :: res

      res =  b%min <= a%min .and. a%max <= b%max
   end function seg_in_segment


   pure function seg_in_segment_list(seg, list) result(res)
      implicit none
      type(segment_t), intent(in) :: seg
      type(segment_t), intent(in) :: list(:)
      logical :: res

      res = any(seg_in_segment(seg, list(:)))

   end function seg_in_segment_list


   !| Check if the one segment is exactly equal to another segment.
   !
   !  This function determines wheter the segment `a` is equivalent to the
   !  segment `b`, meaning both their `min` and `max` values are identical.
   pure elemental function segment_equivalent(a, b) result(res)
      implicit none
      type(segment_t), intent(in) :: a, b
      logical :: res

      res = a%max == b%max .and. a%min == b%min
   end function segment_equivalent


   !| Check if two segments are not equivalent.
   !
   !  This function determines whether the segment `a` is not equivalent to the
   !  segment `b`, meaning their `min` or `max` values are different.
   pure elemental function segment_not_equiv(a, b) result(res)
      implicit none
      type(segment_t), intent(in) :: a, b
      logical :: res

      res = a%max /= b%max .or. a%min /= b%min
   end function segment_not_equiv


   !| Checks if a segment is valid.
   !
   !  This function determines whether the segment is valid by ensuring that
   !  the `min` value is less than or equal to the `max` value.
   pure elemental function segment_is_valid(self) result(res)
      implicit none
      class(segment_t), intent(in) :: self
      logical :: res
      type(segment_t) :: init

      res = self%min /= init%min .and. self%max /= init%max &
      .and. self%min <= self%max
   end function segment_is_valid


   !> This subroutine inverts a list of segment ranges representing Unicode characters.
   !> It compute the complement of the given ranges and modifies the list accordingly.
   !>
   pure subroutine invert_segment_list(list)
      implicit none
      type(segment_t), intent(inout), allocatable :: list(:)
      type(segment_t), allocatable :: new_list(:)

      integer :: i, n, count
      integer :: current_min

      if (.not. allocated(list)) return

      ! sort and merge segments
      call sort_segment_by_min(list)
      call merge_segments(list)

      ! Count the number of new segments
      count = 0
      current_min = UTF8_CODE_EMPTY+1
      n = size(list, dim=1)

      do i = 1, n
         if (current_min < list(i)%min) then
            count = count + 1
         end if
         current_min = list(i)%max + 1
      end do

      if (current_min <= UTF8_CODE_MAX) then
         count = count + 1
      end if

      ! Allocate new list
      allocate(new_list(count))

      ! Fill the new list with the component segments
      count = 1
      current_min = UTF8_CODE_MIN

      do i = 1, n
         if (current_min < list(i)%min) then
            new_list(count)%min = current_min
            new_list(count)%max = list(i)%min - 1
            count = count + 1
         end if
         current_min = list(i)%max + 1
      end do

      if (current_min <= UTF8_CODE_MAX) then
         new_list(count)%min = current_min
         new_list(count)%max = UTF8_CODE_MAX
      end if

      ! Deallocate old list and reassign new list
      deallocate(list)
      list = new_list
   end subroutine invert_segment_list

   !> This function takes an array of segments and a character as arguments,
   !> and returns the segment as rank=1 array to which symbol belongs
   !> (included in the segment interval).
   pure function which_segment_symbol_belong (segments, symbol) result(res)
      use :: forgex_utf8_m
      implicit none
      type(segment_t), intent(in) :: segments(:)
      character(*),    intent(in) :: symbol
      type(segment_t)             :: res

      integer         :: i, i_end, j
      type(segment_t) :: target_for_comparison

      ! If `symbol` is a empty character, return SEG_EMPTY
      if (symbol == '') then
         res = SEG_EMPTY
         return
      end if

      ! Initialize indices.
      i = 1
      i_end = idxutf8(symbol, i)

      ! The target to check for inclusion.
      target_for_comparison = symbol_to_segment(symbol(i:i_end))

      ! Scan the segments array.
      do j = 1, size(segments)
         ! Compare segments and return the later element of the segments, which contains the target segment.
         if (target_for_comparison .in. segments(j)) then
            res = segments(j)
            return
         end if
      end do

      ! If not found, returns SEG_EMPTY.
      res = SEG_EMPTY
   end function which_segment_symbol_belong


   !> This function convert an input symbol into the segment corresponding it.
   pure function symbol_to_segment(symbol) result(res)
      use :: forgex_utf8_m
      implicit none
      character(*), intent(in) :: symbol
      type(segment_t)          :: res

      integer(int32) :: i, i_end, code

      ! If `symbol` is a empty character, return SEG_EMPTY
      if (symbol == char(0)) then
         res = SEG_EMPTY
         return
      else if (symbol == char(32)) then
         res = SEG_SPACE
         return
      end if

      ! Initialize indices
      i = 1
      i_end = idxutf8(symbol, i)

      ! Get the code point of the input character.
      code = ichar_utf8(symbol(i:i_end))

      ! Create a segment corresponding to the code, and return it.
      res = segment_t(code, code)
   end function symbol_to_segment


   !> This procedure registers given segment_t value to segment_t type array,
   !> increments counter of the actual size of the array, and initializes temporary variable.
   pure subroutine register_segment_to_list(segment_list, segment, k, ierr)
      use :: forgex_parameters_m, only: SEGMENT_REGISTERED, SEGMENT_REJECTED
      implicit none
      type(segment_t), intent(inout) :: segment_list(:)
      type(segment_t), intent(inout) :: segment
      integer, intent(inout) :: k
      integer, intent(inout) :: ierr

      if (segment%validate() .and. k <= size(segment_list)-1) then
         k = k + 1

         segment_list(k) = segment ! register

         ierr = SEGMENT_REGISTERED
      else
         ierr = SEGMENT_REJECTED
      end if
   end subroutine register_segment_to_list


   !> This subroutine converts character string that represents hexadecimal value to
   !> the segment corresponding its integer type.
   pure subroutine hex2seg (str, seg, ierr)
      use :: forgex_parameters_m
      use :: forgex_utf8_m
      use :: forgex_error_m
      implicit none
      character(*), intent(in) :: str
      type(segment_t), intent(inout) :: seg
      integer, intent(inout) :: ierr

      character(:), allocatable :: buf, fmt
      character(8) :: c_len

      integer :: i, ios, code
      logical :: is_two_digits, is_longer_digit, is_hex_valid
      
      fmt = ''
      c_len = ''
      code = UTF8_CODE_INVALID
      seg = segment_t(code, code)

      is_two_digits = len(str) == 2
      is_longer_digit = 2 < len(str)

      if (str == '' .or. len(str) <2) then
         ierr = SYNTAX_ERR_HEX_DIGITS_NOT_ENOUGH
         return
      end if

      ! Get the string lenght as a character type.
      write(c_len, '(i0)', iostat=ios) len(str)
      if (ios/= 0) then
         ierr = SYNTAX_ERR_INVALID_HEXADECIMAL
         return
      end if
      fmt = '(z'//trim(c_len)//')'

      ! Get the code point as a integer.
      read(str, fmt=fmt, iostat=ios) code
      is_hex_valid = ios == 0

      ! Error handlers
      if (.not. is_hex_valid) then
         ierr = SYNTAX_ERR_INVALID_HEXADECIMAL
         return
      end if
   
      ! Reject if codepoint valud is invalid as Unicode.
      if (.not.(code .in. SEG_WHOLE)) then
         ierr = SYNTAX_ERR_UNICODE_EXCEED
         return
      end if

      seg = segment_t(code, code)
      ierr = SYNTAX_VALID

   end subroutine hex2seg


   pure subroutine prop2seg(property, seglist, ierr)
      ! use :: forgex_unicode_gc_m
      use :: forgex_error_m
      implicit none
      character(*), intent(in) :: property
      type(segment_t), intent(inout), allocatable :: seglist(:)
      integer, intent(inout) :: ierr

      ! logical :: is_single_prop, is_longer_prop
      ! character(:), allocatable :: prop
      
      ! prop = property
      ! is_single_prop = len(prop) == 1
      ! is_longer_prop = 1 < len(prop)

      ! if (prop == '' .or. len(prop) < 1) then
      !    ierr = SYNTAX_ERR_EMPTY_PROPERTY
      !    return
      ! end if


   end subroutine prop2seg
      

!====================================================================-!
!  Helper procedures

   pure elemental function width_of_segment(seg) result(res)
      use :: forgex_parameters_m, only: INVALID_SEGMENT_SIZE
      implicit none
      type(segment_t), intent(in) :: seg
      integer :: res

      if (seg%validate()) then
         res = seg%max - seg%min + 1
      else
         res = INVALID_SEGMENT_SIZE
      end if
   end function width_of_segment

   pure function total_width_of_segment(seg_list) result(res)
      use :: forgex_parameters_m
      implicit none
      type(segment_t), intent(in) :: seg_list(:)
      integer :: res, k
      res = 0
      do k = 1, size(seg_list)
         res = res + width_of_segment(seg_list(k))
      end do
   end function total_width_of_segment

   !> This function converts two isolated segments into single fused segment
   !> and returns it.
   pure function join_two_segments(segA, segB) result(res)
      implicit none
      type(segment_t), intent(in) :: segA, segB
      type(segment_t) :: res

      res = segment_t(segA%min, segB%max)

      if (.not. res%validate()) then
         res = SEG_INIT
      end if
   
   end function join_two_segments


   pure subroutine sort_segment_by_min(segments)
      implicit none
      type(segment_t), allocatable, intent(inout) :: segments(:)

      integer :: i, j, n
      type(segment_t) :: temp ! temporary variable

      if (.not. allocated(segments)) return

      n = size(segments)
      do i = 1, n-1
         do j = i+1, n
            if (segments(i)%min > segments(j)%min) then
               temp = segments(i)
               segments(i) = segments(j)
               segments(j) = temp
            end if
         end do
      end do
   end subroutine sort_segment_by_min


   pure subroutine merge_segments(segments)
      implicit none
      type(segment_t), allocatable, intent(inout) :: segments(:)
      integer :: i, j, n, m

      if (.not. allocated(segments)) return

      n = ubound(segments, dim=1)

      if (n <= 0) return

      m = 1
      do i = 2, n
         if (segments(i) == SEG_INIT) exit
         m = m+1
      end do
      n = m

      if (n <= 1) then
         segments = segments(:n)
         return
      end if

      j = 1
      do i = 2, n
         if (segments(j)%max >= segments(i)%min-1) then
            segments(j)%max = max(segments(j)%max, segments(i)%max)
         else
            j = j + 1
            segments(j) = segments(i)
         endif
      end do

      if (j <= n) then
         segments = segments(:j)    ! reallocation implicitly.
      end if

   end subroutine merge_segments


   !| Converts a segment to a printable string representation.
   !
   !  This function generates a string representation of the segment `seg` for
   !  printing purposes. It converts special segments to predefined strings
   !  like `<ANY>`, `<LF>`, etc., or generates a character range representation
   !  for segments with defined `min` and `max` values.
   function segment_for_print (seg) result(res)
      use :: forgex_utf8_m
      implicit none
      class(segment_t), intent(in) :: seg
      character(:), allocatable :: res
      character(:), allocatable :: cache

      if (seg == SEG_ANY) then
         res = "<ANY>"

      else if (seg == SEG_TAB) then
         res = "<TAB>"
      else if (seg == segment_t(9, 10)) then
         res = "<TAB, LF>"
      else if (seg == segment_t(9, 11)) then
         res = "<TAB, LF, VT>"   
      else if (seg == segment_t(9, 12)) then
         res = "<TAB, LF, VT, FF>"   
      else if (seg == segment_t(9, 13)) then
         res = "<TAB, LF, VT, FF, CR>"

      else if (seg == SEG_LF) then
         res = "<LF>"
      else if (seg == segment_t(10, 11)) then
         res = "<LF, VT>"   
      else if (seg == segment_t(10, 12)) then
         res = "<LF, VT, FF>"   
      else if (seg == segment_t(10, 13)) then
         res = "<LF, VT, FF, CR>"  

      else if (seg == segment_t(11, 11)) then
         res = "<VT>"   
      else if (seg == segment_t(11, 12)) then
         res = "<VT, FF>"   
      else if (seg == segment_t(11, 13)) then
         res = "<VT, FF, CR>"

      else if (seg == SEG_FF) then
         res = "<FF>"
      else if (seg == segment_t(12, 13)) then
         res = "<FF, CR>"

      else if (seg == SEG_CR) then
         res = "<CR>"
      else if (seg == SEG_SPACE) then
         res = "<SPACE>"
      else if (seg == SEG_ZENKAKU_SPACE) then
         res = "<ZENKAKU SPACE>"
      else if (seg == SEG_EPSILON) then
         res = "?"
      else if (seg == SEG_INIT) then
         res = "<INIT>"
      else if (seg == SEG_EMPTY) then
         res = "<EMPTY>"
      else if (seg%min == seg%max) then
         res = char_utf8(seg%min)
      else if (seg%max == UTF8_CODE_MAX) then
         if (seg%min == ichar(' ')) then
            cache = "<SPACE>"
         else
            cache = '"'//char_utf8(seg%min)//'"'
         end if
         res = '['//cache//'-'//"<U+10FFFF>"//']'
      else
         if (seg%min == ichar(' ')) then
            cache = "<SPACE>"
         else
            cache = '"'//char_utf8(seg%min)//'"'
         end if
         res = '['//cache//'-"'//char_utf8(seg%max)//'"]'
      end if

      !!
      !! @note This function contains magic strings, so in the near future we would like
      !! to extract it to `forgex_parameter_m` module and remove the magic strings.

   end function segment_for_print

end module forgex_segment_m