syntax_tree_graph_m.F90 Source File


Source Code

! Fortran Regular Expression (Forgex)
!
! MIT License
!
! (C) Amasaki Shinobu, 2023-2025
!     A regular expression engine for Fortran.
!     forgex_syntax_tree_graph_m module is a part of Forgex.
!
#ifdef IMPURE
#define pure
#endif
module forgex_syntax_tree_graph_m
   use :: forgex_parameters_m
   use :: forgex_enums_m
   use :: forgex_segment_m, register => register_segment_to_list
   use :: forgex_syntax_tree_node_m, &
      only: tree_node_t, tape_t, terminal, make_atom, make_tree_node, make_repeat_node
   use :: forgex_error_m
   ! use :: forgex_unicode_gc_m
   implicit none
   private

   type, public :: tree_t
   !! This derived-type contains all node of syntax-tree in the tree_node_t type array `nodes`.
      type(tree_node_t), allocatable :: nodes(:)
      integer :: top = INVALID_INDEX
      integer :: num_alloc = 0
      type(tape_t) :: tape
      logical :: is_valid = .true.
      integer :: code = SYNTAX_VALID
      integer :: paren_balance
   contains
      procedure :: build => tree_graph__build_syntax_tree
      procedure :: reallocate => tree_graph__reallocate
      procedure :: deallocate => tree_graph__deallocate
      procedure :: register => tree_graph__register_node
      procedure :: register_connector => tree_graph__register_connector
      procedure :: connect_left => tree_graph__connect_left
      procedure :: connect_right => tree_graph__connect_right
      procedure :: get_top => tree_graph__get_top
      procedure :: regex => tree_graph__regex
      procedure :: term => tree_graph__term
      procedure :: suffix_op => tree_graph__suffix_op
      procedure :: primary => tree_graph__primary
      procedure :: char_class =>tree_graph__char_class
      procedure :: caret_dollar => tree_graph__make_tree_caret_dollar
      procedure :: crlf => tree_graph__make_tree_crlf
      procedure :: shorthand => tree_graph__shorthand
      procedure :: hex2seg => tree_graph__hexadecimal_to_segment
      procedure :: hex2cp => tree_graph__hexadecimal_to_codepoint
      procedure :: property => tree_graph__unicode_property
      procedure :: times => tree_graph__times
      procedure :: print => print_tree_wrap
   end type

   public :: dump_tree_table
   public :: interpret_class_string

   public :: hex2seg

contains

   !> This procedure builds an AST corresponding to a given (regular expression) pattern from it.
   pure subroutine tree_graph__build_syntax_tree(self, pattern)
      implicit none
      class(tree_t), intent(inout) :: self
      character(*), intent(in) :: pattern

      integer :: i, status

      ! if (allocated(self%nodes)) deallocate(self%nodes)
      allocate(self%nodes(TREE_NODE_BASE:TREE_NODE_UNIT), stat=status)
      
      self%nodes(TREE_NODE_BASE:TREE_NODE_UNIT)%own_i = [(i, i=TREE_NODE_BASE, TREE_NODE_UNIT)]
      self%num_alloc = 1
      self%tape%idx = 1
      self%tape%str = pattern
      self%top = 0
      self%paren_balance = 0
      call self%tape%get_token()

      ! Generate AST from a given pattern.
      call self%regex()

      ! Check the pattern is valid.
      if (.not. self%is_valid) return

      ! Determine if parentheses are balanced.
      if (self%paren_balance > 0) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_PARENTHESIS_MISSING
      else if (self%paren_balance < 0) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_PARENTHESIS_UNEXPECTED
      end if
      
      self%nodes(self%top)%parent_i = TERMINAL_INDEX
   end subroutine tree_graph__build_syntax_tree


   !> This procedure handles the reallcation of the `tree_node_t` type array 
   !> within the component of the `tree_t` object.
   !> However, it is not be used in v4.2.
   pure subroutine tree_graph__reallocate(self)
      implicit none
      class(tree_t), intent(inout) :: self
      integer :: new_part_begin, new_part_end, i
      type(tree_node_t), allocatable :: tmp(:)

      if (.not. allocated(self%nodes)) then
         allocate(self%nodes(TREE_NODE_BASE:TREE_NODE_UNIT))
         self%num_alloc = 1
      end if

      new_part_begin = ubound(self%nodes, dim=1) + 1
      new_part_end   = ubound(self%nodes, dim=1) * 2

      if (new_part_end > TREE_NODE_HARD_LIMIT) then
         error stop "Exceeded the maximum number of tree nodes can be allocated."
      end if

      call move_alloc(self%nodes, tmp)

      allocate(self%nodes(TREE_NODE_BASE:new_part_end))

      self%nodes(TREE_NODE_BASE:new_part_begin-1) = tmp(TREE_NODE_BASE:new_part_begin-1)

      self%nodes(new_part_begin:new_part_end)%own_i = [(i, i = new_part_begin, new_part_end)]

      deallocate(tmp)

   end subroutine tree_graph__reallocate


   !> This procedure deallocates `nodes` of `tree_t`
   pure subroutine tree_graph__deallocate(self)
      implicit none
      class(tree_t), intent(inout) :: self

      deallocate(self%nodes)
   end subroutine tree_graph__deallocate


   pure subroutine tree_graph__register_node(self, node)
      implicit none
      class(tree_t), intent(inout) :: self
      type(tree_node_t), intent(inout) :: node

      integer :: top

      top = self%top + 1
      if (top > ubound(self%nodes, dim=1)) then
         call self%reallocate()
      end if
      node%own_i = top
      self%nodes(top) = node
      self%nodes(top)%is_registered = .true.
      self%top = top

   end subroutine tree_graph__register_node

   
   pure subroutine tree_graph__register_connector(self, node, left, right)
      implicit none
      class(tree_t), intent(inout) :: self
      type(tree_node_t), intent(inout) :: node
      type(tree_node_t), intent(in) :: left, right

      call self%register(node)

      call self%connect_left(self%nodes(self%top)%own_i, left%own_i)
      call self%connect_right(self%nodes(self%top)%own_i, right%own_i)

   end subroutine tree_graph__register_connector

   pure subroutine tree_graph__connect_left(self, parent, child)
      implicit none
      class(tree_t), intent(inout) :: self
      integer, intent(in) :: parent, child

      if (parent /= INVALID_INDEX) self%nodes(parent)%left_i = child
      if (child /= INVALID_INDEX) self%nodes(child)%parent_i = parent

   end subroutine tree_graph__connect_left

   pure subroutine tree_graph__connect_right(self, parent, child)
      implicit none
      class(tree_t), intent(inout) :: self
      integer, intent(in) :: parent, child

      if (parent /= INVALID_INDEX) self%nodes(parent)%right_i = child
      if (child /= INVALID_INDEX) self%nodes(child)%parent_i = parent
   end subroutine tree_graph__connect_right

   pure function tree_graph__get_top(self) result(node)
      implicit none
      class(tree_t), intent(in) :: self
      type(tree_node_t) :: node

      node = self%nodes(self%top)
   end function tree_graph__get_top


!=====================================================================!
!  Parsing procedures

   pure recursive subroutine tree_graph__regex(self)
      implicit none
      class(tree_t), intent(inout) :: self

      type(tree_node_t) :: node, left, right

      call self%term()

      ! When term's analysis is valid,
      if (self%is_valid) then

         left = self%get_top()

         do while (self%tape%current_token == tk_union)
            call self%tape%get_token()

            call self%term()
            if (.not. self%is_valid) exit

            right = self%get_top()

            node = make_tree_node(op_union)
            call self%register_connector(node, left, right)

            left = self%get_top()
         end do
      
      else
         if (self%code /= SYNTAX_VALID) then
            return
         end if
      end if

   end subroutine tree_graph__regex


   pure recursive subroutine tree_graph__term(self)
      implicit none
      class(tree_t), intent(inout) :: self
      type(tree_node_t) :: node, left, right

      if (self%tape%current_token == tk_union &
            .or. self%tape%current_token == tk_rpar &
            .or. self%tape%current_token == tk_end) then
         
         node = make_tree_node(op_empty)
         call self%register_connector(node, terminal, terminal)
      else
         call self%suffix_op()
         if (.not. self%is_valid) return

         left = self%get_top()

         do while (self%tape%current_token /= tk_union &
                     .and. self%tape%current_token /= tk_rpar &
                     .and. self%tape%current_token /= tk_end)
            
            call self%suffix_op()
            if (.not. self%is_valid) return

            right = self%get_top()

            node = make_tree_node(op_concat)
            call self%register_connector(node, left, right)

            left = self%get_top()
         end do
      end if

      if (self%tape%current_token == tk_rpar) then
         self%paren_balance = self%paren_balance -1
      end if

   end subroutine


   pure recursive subroutine tree_graph__suffix_op(self)
      implicit none
      class(tree_t), intent(inout) :: self
      type(tree_node_t) :: node, left, right

      call self%primary()
      if (.not. self%is_valid) return

      left = self%get_top()

      select case (self%tape%current_token)
      case (tk_star)
         node = make_tree_node(op_closure)
         call self%register_connector(node, left, terminal)
         call self%tape%get_token()

      case (tk_plus)
         node = make_tree_node(op_closure)
         call self%register_connector(node, left, terminal)

         right = self%get_top()
         node = make_tree_node(op_concat)
         call self%register_connector(node, left, right)

         call self%tape%get_token()

      case (tk_question)
         node = make_tree_node(op_empty)
         call self%register_connector(node, left, terminal)

         right = self%get_top()
         node = make_tree_node(op_union)
         call self%register_connector(node, left, right)
         call self%tape%get_token()

      case (tk_lcurlybrace)
         call self%times()
         if (.not. self%is_valid) then
            ! self%code = SYNTAX_ERR_INVALID_TIMES
            return
         end if
         call self%tape%get_token()

      ! `case default` must NOT be placed here.
      end select
   end subroutine tree_graph__suffix_op


   pure recursive subroutine tree_graph__primary(self)
      use :: forgex_utf8_m, only: ichar_utf8
      implicit none
      class(tree_t), intent(inout) :: self

      type(tree_node_t) :: node
      type(segment_t) :: seg
      character(:), allocatable :: chara


      select case (self%tape%current_token)
      case (tk_char)
         chara = self%tape%token_char
         seg = segment_t(ichar_utf8(chara), ichar_utf8(chara))
         node = make_atom(seg)
         call self%register_connector(node, terminal, terminal)
         call self%tape%get_token() 
      
      case (tk_lpar)
         
         if (self%tape%current_token == tk_lpar) then
            self%paren_balance = self%paren_balance +1
         end if

         call self%tape%get_token()
         call self%regex()
      
         ! If regex fails, return immediately.
         if (.not. self%is_valid) return

         ! If not a right parenthesis, throw an error.
         if (self%tape%current_token /= tk_rpar) then
            self%code = SYNTAX_ERR_PARENTHESIS_MISSING
            self%is_valid = .false.
            return
         end if
         call self%tape%get_token()

      case (tk_lsbracket)
         call self%char_class()
         if (.not. self%is_valid) then
            return
         end if 
         if (self%tape%current_token /= tk_rsbracket) then
            self%code = SYNTAX_ERR_BRACKET_MISSING
            self%is_valid = .false.
            return
         end if
         call self%tape%get_token()

      case (tk_backslash)
         call self%shorthand()
         if (.not. self%is_valid) then
            return
         end if
         call self%tape%get_token()
      
      case (tk_dot)
         node = make_atom(SEG_ANY)
         call self%register_connector(node, terminal, terminal)
         call self%tape%get_token()

      case (tk_caret)
         call self%caret_dollar()
         call self%tape%get_token()
         
      case (tk_dollar)
         call self%caret_dollar()
         call self%tape%get_token()
      
      case (tk_rsbracket)
         self%code = SYNTAX_ERR_BRACKET_UNEXPECTED
         self%is_valid = .false.
         return

      case (tk_rpar)
         self%code = SYNTAX_ERR_PARENTHESIS_UNEXPECTED
         self%is_valid = .false.
         return
   
      ! Unescaped closing curly brace is allowed.
      case (tk_rcurlybrace)
         chara = self%tape%token_char
         seg = segment_t(ichar_utf8(chara), ichar_utf8(chara))
         node = make_atom(seg)
         call self%register_connector(node, terminal, terminal)
         call self%tape%get_token()

      case (tk_lcurlybrace)
         self%code = SYNTAX_ERR_INVALID_TIMES
         self%is_valid = .false.
         return

      case (tk_star)
         self%code = SYNTAX_ERR_STAR_INCOMPLETE
         self%is_valid = .false.
         return

      case (tk_plus)
         self%code = SYNTAX_ERR_PLUS_INCOMPLETE
         self%is_valid = .false.
         return

      case (tk_question)
         self%code = SYNTAX_ERR_QUESTION_INCOMPLETE
         self%is_valid = .false.
         return

      case default
         self%code = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
         self%is_valid = .false.
         return
      end select

   end subroutine tree_graph__primary
      

   !> This subroutine treats character class expression,
   !> and does not call any other recursive procedures.
   pure subroutine tree_graph__char_class(self)
      use :: forgex_utf8_m, only: idxutf8, len_utf8, count_token, ichar_utf8
      use :: forgex_cube_m, only: cube_t, assignment(=)
      use :: forgex_enums_m
      implicit none
      class(tree_t), intent(inout) :: self

      ! type(segment_t), allocatable :: seglist(:)
      type(cube_t) :: cube
      character(:), allocatable :: buf
      type(tree_node_t) :: node

      integer :: siz, ie
      logical :: is_inverted, backslashed
      character(:), allocatable :: prev, curr

      siz = 0

      call self%tape%get_token(class_flag=.true.)

      ! The variable buf stores the string representing the character class.
      buf = ''
      prev = ''
      curr = ''
      backslashed = .false.
      outer: do while (self%tape%current_token /= tk_rsbracket)
         prev = curr
         if (self%tape%current_token == tk_end) then
            return
         end if

         ie = idxutf8(self%tape%token_char, 1)
         curr = self%tape%token_char(1:ie)
         buf = buf//curr

         if (self%tape%current_token == tk_backslash .and. .not. backslashed) then
            backslashed = .true.
         else
            backslashed = .false.
         end if

         call self%tape%get_token(class_flag=.true.)
         
         ! for an escaped right square bracket 
         if (self%tape%current_token == tk_rsbracket .and. backslashed) then
            ie = idxutf8(self%tape%token_char, 1)
            curr = self%tape%token_char(1:ie)
            buf = buf//curr
            call self%tape%get_token(class_flag=.true.)
         end if

      end do outer

      ! If the character class pattern is empty, return false. 
      if (len(buf) == 0) then
         self%code = SYNTAX_ERR_EMPTY_CHARACTER_CLASS
         self%is_valid = .false.
         return
      end if

      ! Handling a negative class case.
      is_inverted = .false.
      if (buf(1:1) == SYMBOL_CRET) then
         is_inverted = .true.
         buf = buf(2:len(buf))  ! May this assignment be a problem?
      end if

      ! The variable siz stores the length of buf.
      siz = len_utf8(buf)

      if (siz < 1) then
         self%code = SYNTAX_ERR_EMPTY_CHARACTER_CLASS
         self%is_valid = .false.
         return
      end if

      call interpret_class_string(buf, cube, self%is_valid, self%code)

      if (.not. self%is_valid) then
         return
      end if

      ! the seglist array have been allocated near the L362.
      if (is_inverted) then
         call cube%invert()
      end if

      node = make_tree_node(op_char)

      ! Manually cube_t copy
      if (.not. cube%is_switched_to_bmp)then 
         node%c%ascii = cube%ascii
      else
         call node%c%switch_bmp()
         node%c%bmp%b(:) = cube%bmp%b(:)
      end if

      node%c%single_flag = cube%single_flag
      if (allocated(cube%sps)) node%c%sps = cube%sps(:)
      if (cube%is_flagged_epsilon()) call node%c%flag_epsilon()

      call self%register_connector(node, terminal, terminal)

   end subroutine tree_graph__char_class

   ! This procedure registers line feed and carriage return nodes into the AST under costruced.
   pure subroutine tree_graph__make_tree_crlf(self)
      implicit none
      class(tree_t), intent(inout) :: self

      type(tree_node_t) :: cr, lf, right, node

      cr = make_atom(SEG_CR)
      call self%register_connector(cr, terminal, terminal)

      lf = make_atom(SEG_LF)
      call self%register_connector(lf, terminal, terminal)

      right = make_tree_node(op_concat)
      call self%register_connector(right, cr, lf)

      node = make_tree_node(op_union)
      call self%register_connector(node, lf, right)

   end subroutine tree_graph__make_tree_crlf


   !> This function constructs a tree node for carriage return (CR) and line feed (LF) characters.
   pure subroutine tree_graph__make_tree_caret_dollar(self)
      implicit none
      class(tree_t), intent(inout) :: self

      type(tree_node_t) :: cr, lf, node_r_r, node_r, node, empty_r

      cr = make_atom(SEG_CR)
      call self%register_connector(cr, terminal, terminal)

      lf = make_atom(SEG_LF)
      call self%register_connector(lf, terminal, terminal)

      node_r_r = make_tree_node(op_concat)
      call self%register_connector(node_r_r, cr, lf)

      node_r = make_tree_node(op_union)
      call self%register_connector(node_r, lf, node_r_r)

      empty_r = make_atom(SEG_EMPTY)
      call self%register_connector(empty_r, terminal, terminal)

      node = make_tree_node(op_union)
      call self%register_connector(node, node_r, empty_r)

   end subroutine tree_graph__make_tree_caret_dollar


   !> This function handles shorthand escape sequences (`\t`, `\n`, `\r`, `\d`, `\D`,
   !> `\w`, `\W`, `\s`, `\S`).
   !> It does not call any other recursive procedures.
   pure subroutine tree_graph__shorthand(self)
      use :: forgex_utf8_m, only: ichar_utf8
      implicit none
      class(tree_t), intent(inout) :: self
      type(tree_node_t) :: node
      type(segment_t), allocatable :: seglist(:)
      type(segment_t) :: seg
      character(:), allocatable :: chara


      select case (trim(self%tape%token_char))
      case (ESCAPE_T)
         node = make_atom(SEG_TAB)
         call self%register_connector(node, terminal, terminal)
         return

      case (ESCAPE_N)
         call self%crlf()
         return

      case (ESCAPE_R)
         node = make_atom(SEG_CR)
         call self%register_connector(node, terminal, terminal)
         return

      case (ESCAPE_D)
         node = make_atom(SEG_DIGIT)
         call self%register_connector(node, terminal, terminal)
         return

      case (ESCAPE_D_CAPITAL)
         call node%c%add(SEG_DIGIT)
         call node%c%invert()
         ! allocate(seglist(1))
         ! seglist(1) = SEG_DIGIT
         ! call invert_segment_list(seglist)

      case (ESCAPE_W)
         call node%c%add(SEG_LOWERCASE)
         call node%c%add(SEG_UNDERSCORE)
         call node%c%add(SEG_DIGIT)
         call node%c%add(SEG_UPPERCASE)

         ! allocate(seglist(4))
         ! seglist(1) = SEG_LOWERCASE
         ! seglist(2) = SEG_UPPERCASE
         ! seglist(3) = SEG_DIGIT
         ! seglist(4) = SEG_UNDERSCORE

      case (ESCAPE_W_CAPITAL)
         call node%c%add(SEG_LOWERCASE)
         call node%c%add(SEG_UNDERSCORE)
         call node%c%add(SEG_DIGIT)
         call node%c%add(SEG_UPPERCASE)
         call node%c%invert()
         ! allocate(seglist(4))
         ! seglist(1) = SEG_LOWERCASE
         ! seglist(2) = SEG_UPPERCASE
         ! seglist(3) = SEG_DIGIT
         ! seglist(4) = SEG_UNDERSCORE
         ! call invert_segment_list(seglist)

      case (ESCAPE_S)
         call node%c%add([SEG_SPACE, SEG_TAB, SEG_CR, SEG_LF, SEG_FF, SEG_ZENKAKU_SPACE])

         ! allocate(seglist(6))
         ! seglist(1) = SEG_SPACE
         ! seglist(2) = SEG_TAB
         ! seglist(3) = SEG_CR
         ! seglist(4) = SEG_LF
         ! seglist(5) = SEG_FF
         ! seglist(6) = SEG_ZENKAKU_SPACE

      case (ESCAPE_S_CAPITAL)
         call node%c%add([SEG_SPACE, SEG_TAB, SEG_CR, SEG_LF, SEG_FF, SEG_ZENKAKU_SPACE])
         call node%c%invert()

         ! allocate(seglist(6))
         ! seglist(1) = SEG_SPACE
         ! seglist(2) = SEG_TAB
         ! seglist(3) = SEG_CR
         ! seglist(4) = SEG_LF
         ! seglist(5) = SEG_FF
         ! seglist(6) = SEG_ZENKAKU_SPACE
         ! call invert_segment_list(seglist)

      case (ESCAPE_X)
         ! Error handling for x escape sequence is handled by hex2seg.
         call self%hex2seg(seglist)
         if (.not. self%is_valid) return
         ! It is not necessary to call self%tape%get_token() procedure.
         call node%c%add(seglist)

      case (ESCAPE_P)
         call self%property(seglist)
         if (.not. self%is_valid) return

      case (EMPTY_CHAR)
         self%code = SYNTAX_ERR_ESCAPED_SYMBOL_MISSING
         self%is_valid = .false.
         return
      case (SYMBOL_LSBK, SYMBOL_RSBK, &
            SYMBOL_LCRB, SYMBOL_RCRB, &
            SYMBOL_LPAR, SYMBOL_RPAR, &
            SYMBOL_DOLL, SYMBOL_BSLH, &
            SYMBOL_VBAR, SYMBOL_DOT, &
            SYMBOL_QUES, SYMBOL_CRET, &
            SYMBOL_STAR, SYMBOL_PLUS, &
            SYMBOL_HYPN)
         chara = self%tape%token_char
         seg = segment_t(ichar_utf8(chara), ichar_utf8(chara))
         node%op = op_char

         call node%c%add(seg)
         call self%register_connector(node, terminal, terminal)
         return

      case default
         self%code = SYNTAX_ERR_ESCAPED_SYMBOL_INVALID
         self%is_valid = .false.
         ! chara = self%tape%token_char
         ! seg = segment_t(ichar_utf8(chara), ichar_utf8(chara))
         ! node = make_atom(seg)
         ! call self%register_connector(node, terminal, terminal)
         return
      end select

      ! allocate(node%c(size(seglist, dim=1)))
      ! This size function is safe because it is always allocated 
      ! to the non-returned branches of the select case above.

      ! node%c(:) = seglist(:)
      node%op = op_char

      call self%register_connector(node, terminal, terminal)

      ! deallocate(seglist)

   end subroutine tree_graph__shorthand


   pure subroutine tree_graph__hexadecimal_to_codepoint(self, cp)
      implicit none
      class(tree_t), intent(inout) :: self
      integer(int32), intent(inout) :: cp

      character(:), allocatable :: buf, fmt
      character(6) :: hex_c
      character(8) :: hex_len_c
      integer :: i, ios
      logical :: is_two_digit, is_longer_digit

      buf = ''

      call self%tape%get_token()

      is_longer_digit = self%tape%current_token == tk_lcurlybrace
      is_two_digit = .not. is_longer_digit

      if (is_longer_digit) call self%tape%get_token()

      buf = self%tape%token_char(1:1) ! First, get the second digit.
      i = 2

      reader: do while(.true.)
         if (is_two_digit .and. i >= 3) exit reader
         call self%tape%get_token()

         if (is_longer_digit .and. self%tape%current_token /= tk_rcurlybrace .and. self%tape%current_token /= tk_char) then
            self%is_valid = .false.
            self%code = SYNTAX_ERR_CURLYBRACE_MISSING
            return
         end if

         if (self%tape%current_token == tk_rcurlybrace) exit reader
         buf = buf//self%tape%token_char(1:1)
         i = i + 1
      end do reader

      hex_c = trim(adjustl(buf))

      write(hex_len_c, '(i0)', iostat=ios) len_trim(hex_c)
      if (ios /= 0) then
         self%code = SYNTAX_ERR_INVALID_HEXADECIMAL
         return
      endif

      fmt = '(z'//trim(hex_len_c)//')'
      read(hex_c, fmt=fmt, iostat=ios) cp

      if (ios /= 0) then
         self%code = SYNTAX_ERR_INVALID_HEXADECIMAL
         return
      end if

      if (.not. (cp .in. SEG_WHOLE)) then
         self%code = SYNTAX_ERR_UNICODE_EXCEED
         return
      end if

   end subroutine tree_graph__hexadecimal_to_codepoint


   !> This procedure handles a escape sequence with '\x'.
   pure subroutine tree_graph__hexadecimal_to_segment(self, seglist)
      implicit none
      class(tree_t), intent(inout) :: self
      type(segment_t), intent(inout), allocatable :: seglist(:)
      
      character(:), allocatable :: hex
      integer :: i
      logical :: is_two_digit, is_longer_digit

      hex = ''

      call self%tape%get_token()

      is_longer_digit = self%tape%current_token == tk_lcurlybrace
      is_two_digit = .not. is_longer_digit

      if (is_longer_digit) call self%tape%get_token()

      hex = self%tape%token_char(1:1) ! First, get the second digit.
      i = 2

      reader: do while(.true.)
         if (is_two_digit .and. i >= 3) exit reader
         call self%tape%get_token()

         if (is_longer_digit .and. self%tape%current_token /= tk_rcurlybrace .and. self%tape%current_token /= tk_char) then
            self%is_valid = .false.
            self%code = SYNTAX_ERR_CURLYBRACE_MISSING
            return
         end if

         if (self%tape%current_token == tk_rcurlybrace) exit reader
         hex = hex//self%tape%token_char(1:1)
         i = i + 1
      end do reader

      allocate(seglist(1))
      call hex2seg(trim(hex), seglist(1), self%code)

      if (self%code /= SYNTAX_VALID) then
         self%is_valid = .false.
         return
      end if

      self%is_valid = seglist(1) .in. SEG_WHOLE

      if (.not. self%is_valid) self%code  = SYNTAX_ERR_UNICODE_EXCEED


   end subroutine tree_graph__hexadecimal_to_segment


   pure subroutine tree_graph__unicode_property(self, seglist)
      implicit none
      class(tree_t), intent(inout) :: self
      type(segment_t), intent(inout), allocatable :: seglist(:)

      character(:), allocatable :: property
      integer :: i
      logical :: is_single_prop, is_longer_prop

      self%code = SYNTAX_ERR_UNICODE_PROPERTY_NOT_IMPLEMENTED
      return
      
      property = ''

      call self%tape%get_token()

      is_longer_prop = self%tape%current_token == tk_lcurlybrace
      is_single_prop = .not. is_longer_prop

      if (is_longer_prop) call self%tape%get_token()

      property = self%tape%token_char(1:1)
      
      if (is_longer_prop) then
         i = 2
         reader: do while (.true.)
            call self%tape%get_token()

            if (self%tape%current_token /= tk_rcurlybrace .and.  self%tape%current_token /= tk_char) then
               self%is_valid = .false.
               self%code = SYNTAX_ERR_CURLYBRACE_MISSING
               return
            end if

            if (self%tape%current_token == tk_rcurlybrace) exit
            property = property//self%tape%token_char(1:1)
            i = i + 1
         end do reader
      end if

      ! call prop2seg(property, seglist, self%code)

   end subroutine tree_graph__unicode_property 


   !> This subroutine handles a quantifier range, and
   !> does not call any other recursive procedures.
   pure subroutine tree_graph__times(self)
      use :: forgex_utility_m, only: get_index_comma, is_integer
      implicit none
      class(tree_t), intent(inout) :: self
      character(:), allocatable :: buf
      integer(int32) :: arg(2), ios, min, max

      type(tree_node_t) :: left, node

      integer :: i, num_comma
      character(:), allocatable :: c1, c2
      logical :: is_infinite

      ! Initialize
      ios = 0
      buf = ''
      arg(:) = INVALID_REPEAT_VAL
      c1 = ''
      c2 = ''
      is_infinite = .false.
      max = INVALID_REPEAT_VAL
      min = INVALID_REPEAT_VAL

      call self%tape%get_token()

      ! Extract the part of the pattern that is the character class that
      ! this procedure should process.
      do while (self%tape%current_token /= tk_rcurlybrace)
         buf= buf//trim(self%tape%token_char)
         call self%tape%get_token

         if (self%tape%current_token == tk_end) then
            self%code = SYNTAX_ERR_CURLYBRACE_MISSING
            self%is_valid = .false.
            return
         end if
      end do

      if (len(buf) == 0) then
      ! Error for a{}
         self%is_valid = .false.
         self%code = SYNTAX_ERR_INVALID_TIMES
         return
      else if (len(buf) == 1) then
      ! Error for a{,}
         if (buf(1:1) == SYMBOL_COMMA) then
            self%is_valid = .false.
            self%code = SYNTAX_ERR_INVALID_TIMES
            return
         end if
      end if

      if (buf(1:1) == ',') then
         buf = "0"//buf ! e.g. {,2} =>{0,2}
      end if


      if (is_integer(buf)) then
         buf = trim(buf)//","//trim(buf) ! e.g. {0} => {0,0}
      end if

   !----------------
      call get_index_comma(buf, i, num_comma)

      ! ios has a negative value if an end-of-record condition is encountered during non-advancing input,
      ! a different negative value if and endfile condition was detected on the input device, a positive value
      ! if an error was detected, or the value zero otherwise.
      !
      ! cf. Michael Metcalf, John Reid and Malcolm Cohen (2018)
      !       - "Modern Fortran Explained--Incorporating Fortran 2018"

      ! patterns like {1,2,3} are error.
      if (num_comma > 1) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_INVALID_TIMES
         return
      end if

      c1 = buf(1:i-1)
      if (i+1 <= len_trim(buf)) c2 = buf(i+1:len_trim(buf))

      read(c1, fmt=*, iostat=ios) arg(1)
      if (ios > 0 .or. arg(1)< 0) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_INVALID_TIMES
         return
      end if

      if (trim(c2) == EMPTY_CHAR) then
         is_infinite = .true.
      else
         read(c2, fmt=*, iostat=ios) arg(2)
         if (ios > 0 .or. arg(2) < 0) then
            self%is_valid = .false.
            self%code = SYNTAX_ERR_INVALID_TIMES
            return
         end if
      end if

      
      if (is_infinite) then
         min = arg(1)
         max = INFINITE
      else
         min = arg(1)
         max = arg(2)
      end if

      if (min == 0 .and. max == 0) then
         continue
      else if (max /= INFINITE .and. min > max) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_INVALID_TIMES
         return
      else if (max == INVALID_REPEAT_VAL .and. min > max) then
         self%is_valid = .false.
         self%code = SYNTAX_ERR_INVALID_TIMES
         return
      end if

      node = make_repeat_node(min, max)
      left = self%get_top()
      call self%register_connector(node, left, terminal)

   end subroutine tree_graph__times


   !> This subroutine parses a pattern string and outputs a list of `segment_t` type.
   pure subroutine interpret_class_string(str, cube, is_valid, ierr)
      use :: forgex_utf8_m, only: idxutf8, next_idxutf8, len_utf8, ichar_utf8
      use :: forgex_parameters_m
      use :: forgex_segment_m, register => register_segment_to_list
      use :: forgex_character_array_m
      use :: forgex_cube_m, only: cube_t
      implicit none

      character(*), intent(in) :: str
      type(cube_t), intent(inout) :: cube
      logical, intent(inout) :: is_valid
      integer, intent(inout) :: ierr

      integer :: i, j, k
      integer :: jerr

      type(segment_t) :: prev_seg, curr_seg
      type(segment_t), allocatable :: list(:), cache(:)

      logical :: backslashed
      logical :: prev_hyphenated, curr_hyphenated
      type(character_array_t), allocatable :: ca(:) ! character array
      integer :: siz ! total number of segment of `ca` array
      character(:), allocatable :: c ! Temporary variable stores a character of interest.

      ! Initialize
      is_valid = .true.
      backslashed = .false.
      prev_hyphenated = .false.
      curr_hyphenated = .false.
      prev_seg = segment_t()
      curr_seg = segment_t()
      
      if (len(str) >= 2) then
         if (str(1:2) == '--') then
            ierr = SYNTAX_ERR_MISPLACED_SUBTRACTION_OPERATOR
            is_valid = .false.
         end if
      end if

      ! Convert to an array from a pattern string.
      call character_string_to_array(str, ca)
      if (.not. allocated(ca)) then
         ierr = SYNTAX_ERR_EMPTY_CHARACTER_CLASS
         is_valid = .false.
         return
      end if

      ! Remove backslash and hyphen, and raise respective flag for each component.
      call parse_backslash_and_hyphen_in_char_array(ca, ierr)
      if (ierr == SYNTAX_ERR_MISPLACED_SUBTRACTION_OPERATOR) then
         is_valid = .false.
         return
      end if

      ! for escape sequences such as \x, \x{...}, \p{...}.
      call parse_escape_sequence_with_argument(ca, ierr)
      if (ierr /= SYNTAX_VALID) then
         is_valid = .false.
         return
      end if

#ifdef IMPURE
      call dump_character_array_t_list(ca)
#endif

      ! Each ca(:)%seg_size will be set by this procedure calling.
      call parse_segment_width_in_char_array(ca)

      ! If each of the array element is hyphenated,
      ! check that the range is not 1 and return invalid.
      siz = 0
      check: do i = 1, size(ca, dim=1)

         ! If the former hypenated range is invalid, throw an error.
         if (ca(i)%is_hyphenated .and. ca(i)%seg_size /= 1) then
            ierr = SYNTAX_ERR_RANGE_WITH_ESCAPE_SEQUENCES
            is_valid = .false.
            return
         end if

         ! If the range following hyphenataed is invalid, throw an error.
         if (i>1) then
            if (ca(i-1)%is_hyphenated .and. ca(i)%seg_size /= 1) then
               ierr = SYNTAX_ERR_RANGE_WITH_ESCAPE_SEQUENCES
               is_valid = .false.
               return
            end if
         end if

         ! If a subtraction flag appear, throw an error at the moment.
         if (ca(i)%is_subtract) then
            ierr = SYNTAX_ERR_CHAR_CLASS_SUBTRANCTION_NOT_IMPLEMENTED
            is_valid = .false.
            return
         end if

         ! If the loop reaches the end of `ca` array, cancel the hyphenated flag, and
         ! then add a literal hyphen to the end.
         if (i> 1 .and. i == size(ca, dim=1)) then
            if (ca(i)%is_hyphenated) then
               ca(i)%is_hyphenated = .false.

               ca = [ca(1:size(ca)), &
                    character_array_t(SYMBOL_HYPN, .false., .false., ca(size(ca))%is_subtract, 1)]
               siz = siz + 1
               exit check
            end if
         end if
         siz = siz + ca(i)%seg_size
      end do check

      if (siz < 1) then
         ierr = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
         is_valid = .false.
         return
      end if
      allocate(list(siz))

      ! Initialize cache and counter variable.
      j = 0 ! Couter of actual list size for `seglist`.
      c = EMPTY_CHAR
      i = 1
      outer: do while(i <= size(ca, dim=1))
         c = ca(i)%c
         backslashed = ca(i)%is_escaped  ! cache `is_escaped` flag
         curr_hyphenated = ca(i)%is_hyphenated
         if (i > 1) prev_hyphenated = ca(i-1)%is_hyphenated 

         ! For escape sequences that take arguments.
         if (backslashed .and. c == ESCAPE_X) then
            i = i + 1
            if (i> size(ca, dim=1)) then
               ierr = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
               is_valid = .false.
               return
            end if
            c = ca(i)%c
            backslashed = ca(i)%is_escaped
            call hex2seg(c, curr_seg, ierr)
            if (ierr /= SYNTAX_VALID) then
               is_valid = .false.
               return
            end if
         else if (backslashed .and. c == ESCAPE_P) then
            ierr = SYNTAX_ERR_UNICODE_PROPERTY_NOT_IMPLEMENTED
            is_valid = .false.
            return
         else
            curr_seg = segment_t(ichar_utf8(c), ichar_utf8(c))
         end if

         ! For escape sequences that do not take arguments
         if (backslashed) then

            call convert_escaped_character_into_segments(c, cache)
            if (cache(1) == SEG_ERROR) then
               ierr = SYNTAX_ERR_ESCAPED_SYMBOL_INVALID
               is_valid = .false.
               return
            end if

            ! If the number of segemnts is greater than 1, register them to the `list`.
            if (size(cache, dim=1) > 1) then
               do k = 1, size(cache)
                  call register(list, cache(k), j, ierr)
               end do
               deallocate(cache)
               prev_seg = segment_t()
               i = i + 1
               cycle outer
            end if 

            curr_seg = cache(1)
         end if


         if (prev_hyphenated) then
            curr_seg = join_two_segments(prev_seg, curr_seg)
            if (curr_seg == SEG_ERROR) then
               ierr = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
               is_valid = .false.
               return
            end if
         end if
      
         if (.not. curr_hyphenated) then
            call register(list, curr_seg, j, jerr)
            if (jerr == SEGMENT_REJECTED) then
               ierr = SYNTAX_ERR_INVALID_CHARACTER_RANGE
               is_valid = .false.
               return
            end if
         end if

         prev_seg = curr_seg
         i = i + 1
      end do outer

      if (j < 1) then
         ! pattern '[+--]' causes this error for now.
         ! ierr = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
         ierr = SYNTAX_ERR_THIS_SHOULD_NOT_HAPPEN
         is_valid = .false.
         return
      end if

      call cube%add(list(1:j)) ! copy local array into the argument array.

   end subroutine interpret_class_string



   !> This subroutine converts escaped character of the argument `chara` into segment `seg_list`. 
   pure subroutine convert_escaped_character_into_segments(chara, seg_list) !, hexcode)
      use :: forgex_utf8_m, only: ichar_utf8
      implicit none
      character(*), intent(in) :: chara
      type(segment_t), allocatable, intent(inout) :: seg_list(:)

      integer :: unused

      if (allocated(seg_list)) deallocate(seg_list)

      select case (trim(chara))
      case (ESCAPE_T)
         allocate(seg_list(1))
         seg_list(1) = SEG_TAB
      case (ESCAPE_N)
         allocate(seg_list(2))
         seg_list(1) = SEG_LF
         seg_list(2) = SEG_CR
      case (ESCAPE_R)
         allocate(seg_list(1))
         seg_list(1) = SEG_CR
      case (ESCAPE_D)
         allocate(seg_list(1))
         seg_list(1) = SEG_DIGIT
      case (ESCAPE_D_CAPITAL)
         allocate(seg_list(1))
         seg_list(1) = SEG_DIGIT
         call invert_segment_list(seg_list)
      case (ESCAPE_W)
         allocate(seg_list(4))
         seg_list(1) = SEG_LOWERCASE
         seg_list(2) = SEG_UPPERCASE
         seg_list(3) = SEG_DIGIT
         seg_list(4) = SEG_UNDERSCORE
      case (ESCAPE_W_CAPITAL)
         allocate(seg_list(4))
         seg_list(1) = SEG_LOWERCASE
         seg_list(2) = SEG_UPPERCASE
         seg_list(3) = SEG_DIGIT
         seg_list(4) = SEG_UNDERSCORE
         call invert_segment_list(seg_list)
      case (ESCAPE_S)
         allocate(seg_list(6))
         seg_list(1) = SEG_SPACE
         seg_list(2) = SEG_TAB
         seg_list(3) = SEG_CR
         seg_list(4) = SEG_LF
         seg_list(5) = SEG_FF
         seg_list(6) = SEG_ZENKAKU_SPACE
      case (ESCAPE_S_CAPITAL)
         allocate(seg_list(6))
         seg_list(1) = SEG_SPACE
         seg_list(2) = SEG_TAB
         seg_list(3) = SEG_CR
         seg_list(4) = SEG_LF
         seg_list(5) = SEG_FF
         seg_list(6) = SEG_ZENKAKU_SPACE
         call invert_segment_list(seg_list)
      case (ESCAPE_X)
         allocate(seg_list(1))
         call hex2seg(chara, seg_list(1), unused)
      case (ESCAPE_P)
         allocate(seg_list(1))
         seg_list(1) = SEG_ERROR
         continue
      case (SYMBOL_BSLH)
         allocate(seg_list(1))
         seg_list(1)%min = ichar_utf8(SYMBOL_BSLH)
         seg_list(1)%max = ichar_utf8(SYMBOL_BSLH)
      case (SYMBOL_LCRB)
         allocate(seg_list(1))
         seg_list(1)%min = ichar_utf8(SYMBOL_LCRB)
         seg_list(1)%max = ichar_utf8(SYMBOL_LCRB)
      case (SYMBOL_RCRB)
         allocate(seg_list(1))
         seg_list(1)%min = ichar_utf8(SYMBOL_RCRB)
         seg_list(1)%max = ichar_utf8(SYMBOL_RCRB)
      case (SYMBOL_LSBK)
         allocate(seg_list(1))
         seg_list(1)%min = ichar_utf8(SYMBOL_LSBK)
         seg_list(1)%max = ichar_utf8(SYMBOL_LSBK)
      case (SYMBOL_RSBK)
         allocate(seg_list(1))
         seg_list(1)%min = ichar_utf8(SYMBOL_RSBK)
         seg_list(1)%max = ichar_utf8(SYMBOL_RSBK)
      case default
         allocate(seg_list(1))
         seg_list(1) = SEG_ERROR
      end select

   end subroutine convert_escaped_character_into_segments


!=====================================================================!
  
   subroutine dump_tree_table(tree)
      use, intrinsic :: iso_fortran_env, stderr => error_unit
      implicit none
      class(tree_node_t), intent(in) :: tree(:)
      type(segment_t), allocatable :: segments(:)

      integer :: i, k

      write(stderr, '(1x, a)') '  own index|  operation|     parent|       left|      right|   registered|  segments'
      do i = TREE_NODE_BASE, ubound(tree, dim=1)
         if (tree(i)%is_registered) then
            write(stderr, '(5i12, a, 10x, 1l, 3x)', advance='no') tree(i)%own_i, &
               tree(i)%op, tree(i)%parent_i, tree(i)%left_i, tree(i)%right_i, '   ', &
               tree(i)%is_registered

            call tree(i)%c%cube2seg(segments)
            do k = 1, ubound(segments, dim=1)
               if (k /= 1) write(stderr, '(a)', advance='no') ', '

               write(stderr, '(a)', advance='no') trim(segments(k)%print())
            end do
            write(stderr, *) ""
         end if
      end do
   end subroutine dump_tree_table

   subroutine print_tree_wrap(self, uni)
      implicit none
      ! type(tree_node_t), intent(in) :: tree(:)
      class(tree_t), intent(in) :: self
      integer, intent(in) :: uni

      call print_tree_internal(self%nodes, self%top, uni)
      write(uni, *) ''
   end subroutine print_tree_wrap

   recursive subroutine print_tree_internal(tree, node_i, uni)
      implicit none
      type(tree_node_t), intent(in) :: tree(:)
      integer, intent(in) :: node_i
      integer, intent(in) :: uni

      if (node_i == INVALID_INDEX) return
      select case (tree(node_i)%op)
      case (op_char)
         write(uni, '(a)', advance='no') trim(print_class_simplify(tree, node_i))
      case (op_concat)
         write(uni, '(a)', advance='no') "(concatenate "
         call print_tree_internal(tree, tree(node_i)%left_i, uni)
         write(uni, '(a)', advance='no') ' '
         call print_tree_internal(tree, tree(node_i)%right_i, uni)
         write(uni, '(a)', advance='no') ')'

      case (op_union)
         write(uni, '(a)', advance='no') "(or "
         call print_tree_internal(tree, tree(node_i)%left_i, uni)
         write(uni, '(a)', advance='no') ' '
         call print_tree_internal(tree, tree(node_i)%right_i, uni)
         write(uni, '(a)', advance='no') ')'

      case (op_closure)
         write(uni, '(a)', advance='no') "(closure"
         call print_tree_internal(tree, tree(node_i)%left_i, uni)
         write(uni, '(a)', advance='no') ')'
      case (op_repeat)
         write(uni, '(a)', advance='no') "(repeat "
         call print_tree_internal(tree, tree(node_i)%left_i, uni)
         if (tree(node_i)%min_repeat == INVALID_REPEAT_VAL) then
            write(uni, "('{', ',', i0, '}')", advance='no') tree(node_i)%max_repeat
         else if (tree(node_i)%max_repeat == INVALID_REPEAT_VAL) then
            write(uni, "('{', i0, ',}')", advance='no') tree(node_i)%min_repeat
         else
            write(uni, "('{', i0, ',', i0, '}')",advance='no') tree(node_i)%min_repeat, tree(node_i)%max_repeat
         end if

         write(uni, '(a)', advance='no') ')'
         
      case (op_empty)
         write(uni, '(a)', advance='no') 'EMPTY'

      case default
         write(uni, '(a)') "This will not occur in 'print_tree'."
         error stop
      end select
   end subroutine print_tree_internal


   function print_class_simplify (tree, root_i) result(str)
      use :: forgex_segment_m, only: SEG_EMPTY
      use :: forgex_utf8_m
      implicit none
      type(tree_node_t), intent(in) :: tree(:)
      
      type(segment_t), allocatable :: segments(:)
      integer(int32) :: root_i
      character(:), allocatable :: str

      integer(int32) :: siz, j
      character(:),allocatable :: buf

      str = ''

      call tree(root_i)%c%cube2seg(segments)

      if (allocated(segments)) then
         siz = size(segments, dim=1)
      else
         return
      end if

      if (siz == 0) return

      if (segments(1) == SEG_LF) then
         str = '<LF>'
         return

      else if (segments(1) == SEG_CR) then
         str = '<CR>'
         return

      else if (segments(1) == SEG_NULL) then
         str = '<NULL>'
         return

      else if (segments(1) == SEG_EMPTY) then
         str ="<EMPTY>"
         return

      else if (siz == 1 .and. segments(1)%min == segments(1)%max) then
         str = '"'//char_utf8(segments(1)%min)//'"'
         return

      else if (siz == 1 .and. segments(1) == SEG_ANY) then
         str = '<ANY>'
         return
      end if

      buf = '[ '
      do j = 1, siz

         if (segments(j) == SEG_LF) then
            buf = buf//'<LF>; '

         else if (segments(j) == SEG_TAB) then
            buf = buf//'<TAB>; '

         else if (segments(j) == SEG_CR) then
            buf = buf//'<CR>; '

         else if (segments(j) == SEG_FF) then
            buf = buf//'<FF>; '

         else if (segments(j) == SEG_SPACE) then
            buf = buf//'<SPACE>; '

         else if (segments(j) == SEG_ZENKAKU_SPACE) then
            buf = buf//'<ZENKAKU SPACE>; '
      
         else if (segments(j)%min == 0) then
            buf = buf//'<NULL>'//'-"'//char_utf8(segments(j)%max)//'"; '

         else if (segments(j)%max == UTF8_CODE_MAX) then
            buf = buf//'"'//char_utf8(segments(j)%min)//'"-"'//"<U+10FFFF>"//'; '

         else
            buf = buf//'"'//char_utf8(segments(j)%min)//'"-"'//char_utf8(segments(j)%max)//'"; '
         end if
      end do

      buf = trim(buf)//']'

      str = trim(buf)

   end function print_class_simplify


end module forgex_syntax_tree_graph_m