function string unicode_to_utf8(long uni%())                    !
    ! Converts an array of Unicode code points (longs) into a UTF-8 encoded string.
    !==================================================================================================
    ! title  : unicode_to_utf8.fun
    ! history:
    ! ver who when   what
    ! --- --- ------ ----------------------------------------------------------------------------------
    ! 100 NSR 170315 1. original effort
    !     NSR 190702 2. added some missing documentation
    !==================================================================================================
    ! parameters:
    !   uni%()  array of longs; uni%(0) holds the number of code points that follow,
    !           uni%(1) .. uni%(uni%(0)) hold one Unicode code point each
    ! returns:
    !   a string holding the UTF-8 octets for every encodable code point, in input order
    ! caveats:
    ! 1) Here we are passing an array of numbers (one code point per element), not a string
    ! 2) This function is best used after calling function: MIXED_TO_UNICODE()
    ! 3) If you call this program for single character conversions, then do so like this:
    !       external string function unicode_to_utf8(long dim())    !!
    !       dim long uni%(5)                                        !!
    !       uni%(0) = 1                                             !! this means "one character"
    !       uni%(1) = 201                                           !! eg. 201 = U+00C9 (capital E with acute)
    !       junk$ = unicode_to_utf8(uni%())                         !! eg. chr$(195) + chr$(137)
    ! 4) Values that cannot be encoded are silently dropped from the output:
    !       negative values, values above U+10FFFF, and surrogates U+D800..U+DFFF
    !       (RFC-3629 prohibits encoding surrogate code points in UTF-8)
    ! 5) uni%(0) is trusted; it must not exceed the upper bound of the array
    ! 6) See these programs for more info:
    !       MYSQL_IMPORT_HELPER_BASIC_TEMPLATE_106_PART3.BAS
    !       AAA_UNICODE_TESTING_100.BAS
    !       MIXED_TO_UNICODE_100.FUN
    !==================================================================================================
    ! UTF-8 encoding
    ! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
    ! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points in
    !    the 21-bit address space are not being used (notice the 'z' on line 4))
    !
    !   UCS-4 range (hex)    UTF-8 octet sequence (binary)                          Data Bits
    !   -------------------  -----------------------------                          ---------
    !   0000,0000-0000,007F  0xxxxxxx                                                7 bits
    !   0000,0080-0000,07FF  110xxxxx 10xxxxxx                                      11 bits
    !   0000,0800-0000,FFFF  1110xxxx 10xxxxxx 10xxxxxx                             16 bits
    !   0001,0000-001F,FFFF  11110zXX 10xxxxxx 10xxxxxx 10xxxxxx                    21 bits (RFC limit)
    !   0020,0000-03FF,FFFF  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           26 bits (invalid)
    !   0400,0000-7FFF,FFFF  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  31 bits (invalid)
    !==================================================================================================
    option type=explicit                                        !
    !
    declare string out$, utf8_seq$                              !
    declare long code_point%, n_points%, i%, j%, bytes%, bits%  !
    !-----------------------------------------------------------------------
    !   main
    !-----------------------------------------------------------------------
    out$ = ""                                                   !
    n_points% = uni%(0)                                         ! data length is stored here
    for i% = 1 to n_points%                                     ! scan the code points
        code_point% = uni%(i%)                                  ! grab some unicode
        !
        !   decide how many octets are required (0 = do not encode)
        !   note: the surrogate case must precede the 3-octet case (first match wins)
        !
        select code_point%                                      !
            case 0 to x"0007F"                                  !
                bytes% = 1                                      !
            case x"00080" to x"007FF"                           !
                bytes% = 2                                      !
            case x"0D800" to x"0DFFF"                           ! surrogates (illegal in UTF-8)
                bytes% = 0                                      ! throw away
            case x"00800" to x"0FFFF"                           !
                bytes% = 3                                      !
            case x"10000" to x"10FFFF"                          !
                bytes% = 4                                      !
            case else                                           ! negative or above U+10FFFF
                bytes% = 0                                      ! throw away anything else
        end select                                              !
        !
        if bytes% > 0 then                                      ! if this code point is encodable
            utf8_seq$ = ""                                      ! zap
            !
            !   build the continuation octets right-to-left; each one carries
            !   the lowest 6 data bits (this loop does not execute when bytes% = 1)
            !
            for j% = 2 to bytes%                                !
                bits% = b"10000000" or (code_point% and x"3f")  ! only use the lowest 6-bits
                utf8_seq$ = chr$(bits%) + utf8_seq$             ! prepend
                code_point% = code_point% / 64                  ! shift data by six bits
            next j%                                             !
            !
            !   the remaining high-order bits go into the lead octet
            !
            select bytes%                                       !
                case 1                                          !
                    bits% = code_point%                         ! no encoding required
                case 2                                          !
                    bits% = b"11000000" or code_point%          !
                case 3                                          !
                    bits% = b"11100000" or code_point%          !
                case 4                                          !
                    bits% = b"11110000" or code_point%          !
            end select                                          !
            out$ = out$ + chr$(bits%) + utf8_seq$               ! lead octet goes first
        end if                                                  !
    next i%                                                     !
    unicode_to_utf8 = out$                                      ! pass string back
end function                                                    ! adios
!
! Back to OpenVMS
! Back to OpenVMS Demo Index
! Back to Home
! Neil Rieck
! Waterloo, Ontario, Canada.