This file defines syntactic parsing.
! Fortran Regular Expression (Forgex) ! ! MIT License ! ! (C) Amasaki Shinobu, 2023-2025 ! A regular expression engine for Fortran. ! `forgex_syntax_tree_m` module is a part of Forgex. ! !! This file defines syntactic parsing. !> The`forgex_syntax_tree_m` module defines parsing and !> the `tree_node_t` derived-type for building syntax-tree. #ifdef IMPURE #define pure #endif module forgex_syntax_tree_node_m use, intrinsic :: iso_fortran_env, stderr => error_unit use :: forgex_parameters_m use :: forgex_segment_m, only: segment_t use :: forgex_enums_m use :: forgex_cube_m, only: cube_t, white_bmp implicit none private public :: tree_node_t public :: tape_t public :: make_atom public :: make_tree_node public :: make_repeat_node !! The regular expression parsing performed by this module !! is done using recursive descent parsing. character(UTF8_CHAR_SIZE), parameter, public :: EMPTY = char(0) type :: tree_node_t !! This type is used to construct a concrete syntax tree, !! later converted to NFA. integer(int32) :: op = op_not_init type(cube_t) :: c integer(int32) :: left_i = INVALID_INDEX integer(int32) :: right_i = INVALID_INDEX integer(int32) :: parent_i = INVALID_INDEX integer(int32) :: own_i = INVALID_INDEX integer(int32) :: min_repeat integer(int32) :: max_repeat logical :: is_registered = .false. logical :: has_closure_left = .false. ! Whether the nodes immediately below the tree node is a closure or not. logical :: has_closure_right = .false. end type type :: tape_t !! This type holds the input pattern string and manages the index !! of the character it is currently focused. character(:), allocatable :: str ! Contains the entire input pattern string integer(int32) :: current_token ! token enumerator (cf. enums_m.f90) character(UTF8_CHAR_SIZE) :: token_char = EMPTY ! initialized as ASCII character number 0 integer(int32) :: idx = 0 ! index of the character that is currently focused contains procedure :: get_token end type type(cube_t), parameter :: terminal_c = cube_t(single_flag=.false.) type(tree_node_t), parameter, public :: terminal = & tree_node_t( op=op_not_init,& left_i=TERMINAL_INDEX, & right_i=TERMINAL_INDEX, & parent_i=INVALID_INDEX, & own_i=INVALID_INDEX, & min_repeat=INVALID_REPEAT_VAL, & max_repeat=INVALID_REPEAT_VAL) contains pure subroutine reallocate_tree(tree, alloc_count) implicit none type(tree_node_t), allocatable, intent(inout) :: tree(:) integer, intent(inout) :: alloc_count type(tree_node_t), allocatable :: tmp(:) integer :: new_part_begin, new_part_end, i if (.not. allocated(tree)) then allocate(tree(TREE_NODE_BASE:TREE_NODE_UNIT)) alloc_count = 1 return end if new_part_begin = ubound(tree, dim=1) + 1 new_part_end = ubound(tree, dim=1)*2 if (new_part_end > TREE_NODE_HARD_LIMIT) then error stop "Exceeded the maximum number of tree nodes can be allocated." end if call move_alloc(tree, tmp) allocate(tree(TREE_NODE_BASE:new_part_end)) alloc_count = alloc_count + 1 ! Deep copy tree(TREE_NODE_BASE:new_part_begin-1) = tmp(TREE_NODE_BASE:new_part_begin-1) ! Initialize new part tree(new_part_begin:new_part_end)%own_i = [(i, i = new_part_begin, new_part_end)] ! deallocate old tree deallocate(tmp) end subroutine reallocate_tree !> This subroutine deallocate the syntax tree. pure subroutine deallocate_tree(tree) implicit none type(tree_node_t), allocatable, intent(inout) :: tree(:) integer :: i ! do i = lbound(tree, dim=1), ubound(tree, dim=1) ! if (allocated(tree(i)%c)) deallocate(tree(i)%c) ! end do if (allocated(tree)) deallocate(tree) end subroutine deallocate_tree !| Get the currently focused character (1 to 4 bytes) from the entire string inside ! the `type_t` derived-type, and store the enumerator's numeric value in the ! `current_token` component. ! This is a type-bound procedure of `tape_t`. pure subroutine get_token(self, class_flag) use :: forgex_utf8_m, only: idxutf8, next_idxutf8 implicit none class(tape_t), intent(inout) :: self logical, optional, intent(in) :: class_flag character(UTF8_CHAR_SIZE) :: c integer(int32) :: ib, ie ib = self%idx if (ib == INVALID_CHAR_INDEX .or. ib > len(self%str)) then self%current_token = tk_end self%token_char = '' else ie = idxutf8(self%str, ib) c = self%str(ib:ie) if (present(class_flag)) then if (class_flag) then select case (trim(c)) case (SYMBOL_RSBK) self%current_token = tk_rsbracket self%token_char = c case (SYMBOL_HYPN) self%current_token = tk_hyphen self%token_char = c case (SYMBOL_BSLH) self%current_token = tk_backslash self%token_char = c case default self%current_token = tk_char self%token_char = c end select end if else select case(trim(c)) case (SYMBOL_VBAR) self%current_token = tk_union case (SYMBOL_LPAR) self%current_token = tk_lpar case (SYMBOL_RPAR) self%current_token = tk_rpar case (SYMBOL_STAR) self%current_token = tk_star case (SYMBOL_PLUS) self%current_token = tk_plus case (SYMBOL_QUES) self%current_token = tk_question case (SYMBOL_BSLH) self%current_token = tk_backslash ib = next_idxutf8(self%str, ie) ie = idxutf8(self%str, ib) self%token_char = self%str(ib:ie) case (SYMBOL_LSBK) self%current_token = tk_lsbracket case (SYMBOL_RSBK) self%current_token = tk_rsbracket case (SYMBOL_LCRB) self%current_token = tk_lcurlybrace self%token_char = c case (SYMBOL_RCRB) self%current_token = tk_rcurlybrace self%token_char = c case (SYMBOL_DOT) self%current_token = tk_dot case (SYMBOL_CRET) self%current_token = tk_caret case (SYMBOL_DOLL) self%current_token = tk_dollar case default self%current_token = tk_char self%token_char = c end select end if self%idx = next_idxutf8(self%str, ib) end if end subroutine get_token !=====================================================================! pure function make_tree_node(op) result(node) implicit none integer(int32), intent(in) :: op type(tree_node_t) :: node node%op = op end function make_tree_node pure function make_atom(segment) result(node) implicit none type(segment_t), intent(in) :: segment type(tree_node_t) :: node node%op = op_char call node%c%add(segment) end function pure function make_repeat_node(min, max) result(node) implicit none integer(int32), intent(in) :: min, max type(tree_node_t) :: node node%op = op_repeat node%min_repeat = min node%max_repeat = max end function make_repeat_node end module forgex_syntax_tree_node_m