cube_m.F90 Source File


Source Code

! Fortran Regular Expression (Forgex)
!
! MIT License
!
! (C) Amasaki Shinobu, 2023-2025
!     A regular expression engine for Fortran.
!     forgex_cube_m module is a part of Forgex.
!
#ifdef IMPURE
#define pure
#endif
module forgex_cube_m
   use, intrinsic :: iso_fortran_env, only: int64, int32
   use :: forgex_parameters_m, only: BMP_SIZE, BMP_SIZE_BIT, bits_64, INVALID_CODE_POINT, UTF8_CODE_MAX, ASCII_SIZE, ASCII_SIZE_BIT
   use :: forgex_bitmap_m, only: bmp_t, ascii_t
   use :: forgex_segment_m, only: segment_t, symbol_to_segment, &
      operator(.in.), SEG_INIT, SEG_EPSILON, operator(==), width_of_segment, invert_segment_list
   use :: forgex_utf8_m, only: ichar_utf8
   implicit none
   private


   type, public :: cube_t ! contains all planes (ascii, BMP and SPs) of Unicode.
      logical :: epsilon_flag = .false.
      logical :: single_flag = .true.
      logical :: is_switched_to_bmp = .false.
      type(ascii_t) :: ascii                 ! for  U+0000 .. U+007F ASCII
      type(bmp_t), allocatable :: bmp        ! for  U+0000 .. U+FFFF BMP
      type(segment_t), allocatable :: sps(:) ! for U+10000 .. U+10FFFF SPs (SIP, SMP, etc.)
   contains
      procedure :: flag_epsilon => cube_flag__epsilon
      procedure :: is_flagged_epsilon => cube_flag__is_flagged_epsilon
      procedure :: cube_add__symbol
      procedure :: cube_add__segment
      procedure :: cube_add__segment_list
      procedure :: cube_add__cube
      procedure :: switch_bmp => cube__switch_ascii_to_bmp
      procedure :: cube2seg => cube__bmp2seg
      procedure :: print_sps => cube__dump_sps
      procedure :: invert => cube__invert
      procedure :: num => cube__number_of_flagged_bits
      procedure :: first => cube__first_codepoint
      generic :: add => cube_add__symbol, cube_add__segment, cube_add__segment_list, cube_add__cube
   end type cube_t

   interface operator(.in.)
      module procedure :: cube_t__symbol_in_cube
   end interface

   interface assignment(=)
      module procedure :: cube_t__cube_assign
   end interface

   interface operator(==)
      module procedure :: cube_t__equal
   end interface

   public :: operator(.in.)
   public :: assignment(=)
   public :: operator(==)

   integer :: q
   type(bmp_t), parameter, public :: white_bmp = bmp_t([(0_int64, q=0, BMP_SIZE-1)])

contains

   pure subroutine cube_t__cube_assign(a, b)
      implicit none
      type(cube_t), intent(inout) :: a
      type(cube_t), intent(in) :: b

      integer :: num

      a%is_switched_to_bmp = b%is_switched_to_bmp
      a%epsilon_flag = b%epsilon_flag
      if (b%is_switched_to_bmp) then
         call a%switch_bmp()
         a%bmp%b(:) = b%bmp%b(:)
      else
         a%ascii%a(:) = b%ascii%a(:)
      end if

      if (.not. allocated(b%sps)) return
      num = ubound(b%sps, dim=1)
      a%sps = b%sps ! implicit reallocation

      a%single_flag = a%num() == 1

   end subroutine cube_t__cube_assign


   pure function cube_t__symbol_in_cube (symbol, cube) result(ret)
      implicit none
      character(*), intent(in) :: symbol
      type(cube_t), intent(in) :: cube
      logical :: ret

      integer :: cp

      cp = ichar_utf8(symbol)

      if (cp < ASCII_SIZE_BIT .and. .not. cube%is_switched_to_bmp) then
         ret = iand(cube%ascii%a(cp/bits_64), ishft(1_int64, mod(cp, bits_64))) /= 0_int64

      else if (cp < BMP_SIZE_BIT .and. cube%is_switched_to_bmp) then
         ret = iand(cube%bmp%b(cp/bits_64), ishft(1_int64, mod(cp, bits_64))) /= 0_int64

      else if (cp >= ASCII_SIZE_BIT .and. cp < BMP_SIZE_BIT .and. .not. cube%is_switched_to_bmp) then
            ret = .false.
      else
         if (allocated(cube%sps)) then
            ret = symbol_to_segment(symbol) .in. cube%sps(:)
         else
            ret = .false.
         end if
      end if

   end function cube_t__symbol_in_cube


   pure function cube_t__equal(a, b) result(ret)
      use :: forgex_segment_m, only: segment_t, merge_segments, sort_segment_by_min
      implicit none
      type(cube_t), intent(in) :: a, b

      type(segment_t), allocatable :: a_sps(:), b_sps(:)
      logical :: ret, candi

      integer :: i

      ret = .false.

      if (a%is_switched_to_bmp .neqv. b%is_switched_to_bmp) return
      if (.not. all(a%ascii%a(:) == b%ascii%a(:))) return

      if (allocated(a%bmp) .and. allocated(b%bmp)) then
         if (any(a%bmp%b(:) /= b%bmp%b(:))) return
      end if


      if (allocated(a%sps) .and. allocated(b%sps)) then
         a_sps = a%sps
         b_sps = b%sps
         call sort_segment_by_min(a_sps)
         call merge_segments(a_sps)
         call sort_segment_by_min(b_sps)
         call merge_segments(b_sps)

         if (size(a_sps, dim=1) == size(b_sps, dim=1)) then
            candi = .true. 
            do i = 1, size(a_sps, dim=1)
               candi = candi .and. a_sps(i) == b_sps(i)
            end do
            ret = candi
            return

         end if
      end if

      ret = .true.

   end function cube_t__equal


   pure function cube_t__codepoint_in_cube (cp, cube) result(ret)
      implicit none
      integer(int32), intent(in) :: cp
      type(cube_t), intent(in) :: cube
      logical :: ret

      if (cp < ASCII_SIZE_BIT .and. .not. cube%is_switched_to_bmp) then
         ret = iand(cube%ascii%a(cp/bits_64), ishft(1_int64, mod(cp, bits_64))) /= 0_int64
      else if (cp < BMP_SIZE_BIT) then
         ret = iand(cube%bmp%b(cp/bits_64), ishft(1_int64, mod(cp, bits_64))) /= 0_int64
      else
         ret = cp .in. cube%sps(:)
      end if

   end function cube_t__codepoint_in_cube

!=====================================================================!

   pure subroutine cube_flag__epsilon(self)
      implicit none
      class(cube_t), intent(inout) :: self
      self%epsilon_flag = .true.
   end subroutine cube_flag__epsilon

   pure logical function cube_flag__is_flagged_epsilon(self)
      implicit none
      class(cube_t), intent(in) :: self
      cube_flag__is_flagged_epsilon = self%epsilon_flag
   end function cube_flag__is_flagged_epsilon

!=====================================================================!

   pure subroutine cube_add__symbol(self, symbol)
      implicit none
      class(cube_t), intent(inout) :: self
      character(*), intent(in) :: symbol

      integer :: cp
      cp = ichar_utf8(symbol)
      if (cp == -1) return ! WARNING: magic nubmer
      if (cp < ASCII_SIZE_BIT .and. .not. self%is_switched_to_bmp) then
         call self%ascii%add(cp)

      else if (cp < BMP_SIZE_BIT) then
         call self%switch_bmp()
         call self%bmp%add(cp)

      else
         call self%switch_bmp()
         call cube_add__segment(self, segment_t(cp, cp))
      end if

      if (self%single_flag) self%single_flag = self%num() == 1

   end subroutine cube_add__symbol

   pure subroutine cube_add__segment(self, segment)
      implicit none
      class(cube_t), intent(inout) :: self
      type(segment_t), intent(in) :: segment

      integer :: cp_min, cp_max, sps_size, i, j
      type(segment_t), allocatable :: tmp(:)
      type(segment_t) :: what_to_add

      if (segment == SEG_EPSILON) then
         call self%flag_epsilon()
         return
      end if

      cp_min = segment%min
      cp_max = segment%max

      if (cp_max < ASCII_SIZE_BIT .and. .not. self%is_switched_to_bmp) then
         call self%ascii%add(cp_min, cp_max)
         if (self%single_flag) self%single_flag = self%num() == 1
         return
      else
         call self%switch_bmp()
         call self%bmp%add(cp_min, cp_max)
      end if

      if (cp_max > BMP_SIZE_BIT) then

         what_to_add = segment_t(max(cp_min, BMP_SIZE_BIT), cp_max)

         if (allocated(self%sps)) then
            sps_size = size(self%sps, dim=1) + 1
            allocate(tmp(sps_size))
            j = 0
            do i = 1, size(self%sps)
               j = j + 1
               if (self%sps(i)%min < what_to_add%min) then
                  tmp(j) = self%sps(i)
               else
                  tmp(j) = what_to_add
               end if
            end do
            self%sps = tmp(1:sps_size) ! implicit reallocation

         else
            self%sps = [segment]
         end if

      end if

      if (self%single_flag) self%single_flag = self%num() == 1

   end subroutine cube_add__segment


   pure subroutine cube_add__segment_list(self, seglist)
      implicit none
      class(cube_t), intent(inout) :: self
      type(segment_t), intent(in) :: seglist(:)

      integer :: cp_min, cp_max, siz, i, j, k, m, n, p

      type(segment_t), allocatable :: tmp(:), ret(:)
      type(segment_t) :: what_to_add

      integer :: upper

      if (allocated(self%sps)) then
         m = size(self%sps)
      else
         m = 0
      end if
      n = size(seglist, dim=1)

      if (any(seglist == SEG_EPSILON)) then
         self%epsilon_flag = .true.
      end if

      ! Scan all segments in the list to find max value of them.
      upper = 0
      do i = 1, n
         upper = max(seglist(i)%max, upper)
      end do

      ! If all values of the list is within the range of ASCII, register them to self%ascii and retrun.
      if (upper < ASCII_SIZE_BIT .and. .not. self%is_switched_to_bmp) then
         do i = 1, n
            cp_min = seglist(i)%min
            cp_max = seglist(i)%max
            call self%ascii%add(cp_min, cp_max)
         end do
         if (self%single_flag) self%single_flag = self%num() == 1
         return
      end if

      if (.not. self%is_switched_to_bmp) call self%switch_bmp()

      siz = m + n
      allocate(tmp(n))
      allocate(ret(siz+1))

      k = 0 ! for tmp
      j = 1 ! for segments to add
      do while ( j <= n)
         cp_min = seglist(j)%min
         cp_max = seglist(j)%max
         call self%bmp%add(cp_min, cp_max)
         if (cp_max > BMP_SIZE_BIT) then
            k = k + 1
            what_to_add = segment_t(max(cp_min, BMP_SIZE_BIT), cp_max)
            tmp(k) = what_to_add
         end if

         j = j + 1
      end do

      if (k /= 0) then
         joint: block
            type(segment_t), allocatable :: cache(:)
            if (allocated(self%sps)) then
               p = ubound(self%sps, dim=1)
               cache(1:p) = self%sps(1:p)
               deallocate(self%sps)

               allocate(self%sps(1:p+k))
               self%sps(1:p) = cache(1:p)
               self%sps(p+1:p+k) = tmp(1:k)
            else
               allocate(self%sps(1:k))
               self%sps(1:k) = tmp(1:k)
            end if

         end block joint
      end if

      if (self%single_flag) self%single_flag = self%num() == 1
   end subroutine cube_add__segment_list


   pure subroutine cube_add__cube(self, cube)
      implicit none
      class(cube_t), intent(inout) :: self
      type(cube_t), intent(in) :: cube

      integer :: i

      if (.not. self%is_switched_to_bmp .and. .not. cube%is_switched_to_bmp ) then
         do concurrent (i=0:ASCII_SIZE-1)
            self%ascii%a(i) = ior(self%ascii%a(i), cube%ascii%a(i))
         end do
         return
      else
         call self%switch_bmp()
      end if

      do concurrent (i = 0:BMP_SIZE-1)
         self%bmp%b(i) = ior(self%bmp%b(i), cube%bmp%b(i))
      end do

      if (allocated(cube%sps)) then
         call cube_add__segment_list(self, cube%sps)
      end if

      if (self%single_flag) self%single_flag = self%num() == 1
   end subroutine cube_add__cube


   pure subroutine cube__switch_ascii_to_bmp(self)
      implicit none
      class(cube_t), intent(inout) :: self

      self%is_switched_to_bmp = .true.
      if (.not. allocated(self%bmp)) then
         allocate(self%bmp)
         self%bmp%b(0:1) = self%ascii%a(0:1)
      end if

   end subroutine cube__switch_ascii_to_bmp

!=====================================================================!

   pure subroutine cube__invert(self)
      implicit none
      class(cube_t), intent(inout) :: self
      integer :: i
      integer(int64) :: mask

      if (.not. self%is_switched_to_bmp) then
         call self%switch_bmp()
      end if

      mask = not(shiftl(1_int64, 32) -1)
      self%bmp%b(0) = iand(not(self%bmp%b(0)), mask)

      do concurrent (i = 1:BMP_SIZE-1)
         self%bmp%b(i) = not(self%bmp%b(i))
      end do

      if (.not. allocated(self%sps)) then
         self%sps = [segment_t(BMP_SIZE_BIT, UTF8_CODE_MAX)]
      else
         call invert_segment_list(self%sps)
      end if

      self%single_flag = self%num() == 1
   end subroutine cube__invert


   pure function cube__number_of_flagged_bits(self) result(ret)
      implicit none
      class(cube_t), intent(in) :: self
      integer :: ret

      integer :: i
      integer :: partial_sum(0:BMP_SIZE-1)

      if (.not. self%is_switched_to_bmp) then
         ret = popcnt(self%ascii%a(0)) + popcnt(self%ascii%a(1))
         return
      end if

      ret = 0
      do concurrent (i = 0:BMP_SIZE-1)
         partial_sum(i) = popcnt(self%bmp%b(i))
      end do

      ret = sum(partial_sum)

      if (allocated(self%sps)) then
         do i = 1, size(self%sps)
            ret = ret + width_of_segment(self%sps(i))
         end do
      end if

   end function cube__number_of_flagged_bits


   pure function cube__first_codepoint(self) result(ret)
      use :: forgex_parameters_m, only: UTF8_CODE_MAX
      implicit none
      class(cube_t), intent(in) :: self

      integer :: i, num, pos, ret, candi

      if (.not. self%is_switched_to_bmp) then
         do i = 0, ASCII_SIZE-1
            if (self%ascii%a(i) /= 0) then
               pos = trailz(self%ascii%a(i))
               ret = i*bits_64+pos
               return
            end if
         end do
         ret = INVALID_CODE_POINT
         return
      end if

      do i = 0, BMP_SIZE-1
         if (self%bmp%b(i) /= 0) then
            pos = trailz(self%bmp%b(i))
            ret = i*bits_64 + pos
            return
         end if
      end do

      ret = INVALID_CODE_POINT
      if (.not. allocated(self%sps)) return

      candi = UTF8_CODE_MAX
      do i = 1, size(self%sps)
         candi = min(candi, self%sps(i)%min)
      end do

      if (candi /= UTF8_CODE_MAX) then
         ret = candi
      else
         ret = -1
      end if

   end function cube__first_codepoint


!=====================================================================!

   pure subroutine cube__bmp2seg(self, segments)
      implicit none
      class(cube_t), intent(in) :: self
      type(segment_t), allocatable, intent(inout) :: segments(:)

      type(segment_t), allocatable :: tmp(:)

      integer :: m, n

      if (allocated(segments)) deallocate(segments)

      if (.not. self%is_switched_to_bmp) then
         call self%ascii%ascii2seg(segments)
         return
      end if

      if (allocated(self%bmp)) then
         call self%bmp%bmp2seg(tmp)
         m = size(tmp, dim=1)
      else
         m = 0
      end if

      if (allocated(self%sps)) then
         n = size(self%sps, dim=1)
      else
         n = 0
      end if

      if (m+n > 0) then
         allocate(segments(m+n))
         segments(1:m) = tmp(1:m)
         if (n > 0) segments(m+1:m+n) = self%sps(1:n)
      end if

   end subroutine cube__bmp2seg


   subroutine cube__dump_sps(self)
      class(cube_t), intent(in) :: self

      integer :: i
      if (.not. allocated(self%sps)) return

      do i = 1, ubound(self%sps, dim=1)
         write(0,*) self%sps(i)%print()
      end do

   end subroutine cube__dump_sps

end module forgex_cube_m