OpenVMS Source Code Demos

UNICODE_TO_ISO

	function string unicode_to_iso(string src$, long debug%)		! debug is not used here
	!==================================================================================================
	! title  : UNICODE_TO_ISO_106.FUN
	! caveat1: This function is misnamed. It should have called "utf8_to_cp1252"
	! purpose: Scan inbound data looking for legal UTF-8 code sequences. These are converted to unicode
	!	   which is then mapped to cp1252 (also known as Windows-1252; also known as ANSI) which is
	!	   a superset of ISO-8859-1
	! caveat2: There are two ways to do this: Strict and Relaxed
	!	   1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
	!	   2) Relaxed:
	!	      2a. anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so we keep it
	!	      2b. any unicode value above 255 must be mapped back to the equivalent cp1252 code if possible
	!		  otherwise we throw it away (because we only have room for one single byte)
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 105 DGM 150401 1. renovation
	! 106 NSR 160314 1. renovation (to match utf8_decode.c)
	!     NSR 170315 2. minor cleanup + documentation changes
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
	!	in the 21-bit address space are being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	declare string	dst$, tst$, alt$					!
	declare long	uni%, tst%, alt%, i%, j%, bytes%			!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	for i% = 1 to len(src$)							! scan the string
	    tst$ = mid$(src$, i%, 1)						! isolate tst character
	    tst% = asc(tst$)							! convert to ascii
	    if tst% <= 127 then							!
		dst$ = dst$ + tst$						! copy ASCII as-is
		goto get_next_char						! next iteration
	    end if								!
	    !
	    if (tst% and X"e0") = x"c0" then					! test for: 1100-xxxx
		bytes%	= 2							! this might be a 2-byte sequence (or not)
		uni%	= tst% and x"1f"					! keep 5-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f0") = x"e0" then					! test for: 1110-xxxx
		bytes%	= 3							! this might be a 3-byte sequence (or not)
		uni%	= tst% and x"0f"					! keep 4-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f8") = x"f0" then					! test for: 1111-xxxx
		bytes%	= 4							! this might be a 4-byte sequence (or not)
		uni%	= tst% and x"07"					! keep 3-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    !
	    !	definately not unicode
	    !
	    dst$ = dst$ + tst$							! copy cp1252 as-is
	    goto get_next_char							! next iteration
	    !
	    !	this might be a unicode character depending upon the following bytes following
	    !	entry:	i%	= points to tmp$ (first utf-8 octet)
	    !		bytes%	= expected total number of octects (2-4)
	    !
	    process_uni:							!
	    for j% = 1 to (bytes%-1)						!
		alt$ = mid$(src$, i%+j%, 1)					! isolate character after tst$
		alt% = asc(alt$)						!
		if (alt% and x"c0") = x"80"					! is this a secondary utf-8 octet? (10xx-xxxx)
		then								! yes
		    alt% = (alt% and x"7f")					! isolate 6-bits
		    uni% = uni% * 64%						! shift by 6 places
		    uni% = uni% + alt%						! merge bits
		else								! no
		    dst$ = dst$ + tst$						! assume is cp1252 then copy as-is
		    goto get_next_char						!
		end if								!
	    next j%								!
	    !
	    !	at this point, this appeared to be a legal UTF-8 sequence
	    !
	    !-----------------------------------------------------------
	    !	unicode mapping to cp1252
	    !	note: perhaps a single table lookup would be more efficient
	    !-----------------------------------------------------------
	    !	unicode ref: http://unicode-table.com/en/
	    !
	    select uni%							!
		case <= 255						!
				dst$ = dst$ + chr$(uni%)		!
		case 8208, 8209						! x2010,x2011	Unicode: hyphen, non-breaking hyphen
				dst$ = dst$ + chr$(45)			!		ASCII  : dash
		case 8210, 8211						! x2012,x2013	Unicode: Figure Dash, En Dash
				dst$ = dst$ + chr$(150)			!		Windows: dash
		case 8212, 8213						! x2014,x2015	Unicode: EM dash, Horizontal Bar
				dst$ = dst$ + chr$(151)			!		Windows: dash
		case 8216						! x2018		Unicode: left side single quote
				dst$ = dst$ + chr$(145)			!		Windows: left single quote
		case 8217						! x2019		Unicode: right side single quote
				dst$ = dst$ + chr$(146)			!		Windows: right single quote
		case 8220						! x201c		Unicode: left double-quote
				dst$ = dst$ + chr$(147)			!		Windows: left double-quote
		case 8221						! x201d		Unicode: right double-quote
				dst$ = dst$ + chr$(148)			!		Windows" right double-quote
		case 8224						! x2020		Unicode: dagger
				dst$ = dst$ + chr$(134)			!		Windows: dagger
		case 8225						! x2021		Unicode: double dagger
				dst$ = dst$ + chr$(135)			!		Windows: double dagger
		case 8226						! x2022		Unicode: black dot
				dst$ = dst$ + chr$(149)			!		Windows: black dot
		case 8230						! x2026		Unicode: three dot horizontal ellipsis
				dst$ = dst$ + chr$(133)			!		Windows: horizontal ellipsis
		case 8248						! x2038		Unicode: caret
				dst$ = dst$ + chr$(94)			!		Windows: circumflex
		case 8249						! x2039		Unicode: single less than
				dst$ = dst$ + chr$(139)			!		Windows: single less than
		case 8250						! x203a		unicode: single greater than
				dst$ = dst$ + chr$(155)			!		Windows: single greater than
		case 8364						! x20ac		Unicode: Euro symbol
				dst$ = dst$ + chr$(128)			!		Windows: Euro symbol
		case else						! oops, what do we do now?
%if 1=1 %then								!
				dst$ = dst$ + chr$(182)			! substitute	Windows: Pilcrow (funny inverted "P")
%else									!
									! do nothig (throw it way)
%end %if								!
	    end select							!
	    !-------------------------------------------------------------------
	    i% = i% + bytes% - 1						! eat some chars (NEXT will eat one more)
	    get_next_char:							!
	next i%									! advance by tst
	unicode_to_iso = dst$							! pass back to called
	end function								! adios
	!