The char_utf8
function takes a code point as integer in Unicode character set,
and returns the corresponding character as UTF-8 binary string.
This function is like an extension of char() for the UTF-8 codeset.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
integer(kind=int32), | intent(in) | :: | code |
pure function char_utf8 (code) result(str) use, intrinsic :: iso_fortran_env implicit none integer(int32), intent(in) :: code ! Input Unicode code point. character(:), allocatable :: str ! Resulting one UTF-8 character. character(32), allocatable :: bin ! A 32-digit number expressed in character format for masking. integer(int32) :: buf, mask ! Buffer and mask for bit operations. integer(int8) :: byte(4) ! Array to hold up 4 bytes of the UTF-8 character. str = '' ! Initialize result string. buf = code ! Initialize buffer with input `code` point. bin = '0000000000000000000000000111111' ! Lower 6-bit mask read(bin, '(b32.32)') mask ! Read the `mask` from the `bin` character string. byte(1) = int(iand(ishft(buf, -18), mask),kind(byte)) ! First byte buf = code byte(2) = int(iand(ishft(buf, -12), mask), kind(byte)) ! Second byte buf = code byte(3) = int(iand(ishft(buf, -6), mask), kind(byte)) ! Third byte buf = code byte(4) = int(iand(buf, mask), kind(byte)) ! Fourth byte if (code > 2**7-1) then ! Check if the `code` point is greater than 127 (non-ASCII character). if (2**16 -1 < code) then ! 4-byte character byte(1) = ibset(byte(1),7) byte(1) = ibset(byte(1),6) byte(1) = ibset(byte(1),5) byte(1) = ibset(byte(1),4) byte(1) = ibclr(byte(1),3) byte(2) = set_continuation_byte(byte(2)) ! Set continuation bytes. byte(3) = set_continuation_byte(byte(3)) byte(4) = set_continuation_byte(byte(4)) else if (2**11 - 1 < code) then ! 3-byte character byte(1) = 32 byte(2) = ibset(byte(2), 7) byte(2) = ibset(byte(2), 6) byte(2) = ibset(byte(2), 5) byte(2) = ibclr(byte(2), 4) byte(3) = set_continuation_byte(byte(3)) byte(4) = set_continuation_byte(byte(4)) else if (2**7 -1 < code) then ! 2-byte character byte(1) = 32 byte(2) = 32 byte(3) = ibset(byte(3), 7) byte(3) = ibset(byte(3), 6) byte(3) = ibclr(byte(3), 5) byte(4) = set_continuation_byte(byte(4)) end if str = char(byte(1)) //char(byte(2)) //char(byte(3)) //char(byte(4)) ! Concatenate bytes into a string. str = trim(adjustl(str)) ! Trim leading and tailing space. else str = char(code) ! For ASCII characters. end if end function char_utf8