OpenVMS Source-Code Demos

mixed_to_unicode_array

	function long mixed_to_unicode_array(string inbound$, long uni%())		!
	!==================================================================================================
	! title  : mixed_to_unicode_array.fun
	! purpose: Scan inbound data looking for legal UTF-8 code sequences and convert each one into a
	!	   single unicode code point. Bytes above ASCII 127 which are not part of a legal UTF-8
	!	   sequence are assumed to be cp1252 (also known as Windows-1252; also known as ANSI; a
	!	   superset of ISO-8859-1) and are mapped from cp1252 to unicode.
	! params : inbound$ = source bytes (any mix of ASCII, UTF-8 and cp1252)
	!	   uni%()   = destination array; code points are stored in elements 1..count and the
	!		      count is also stored in element 0. Each inbound byte produces at most
	!		      one element so an upper bound of len(inbound$) is always large enough.
	! returns: the number of code points stored in uni%()
	! caveat1: There are two ways to do this: Strict and Relaxed (this function is Relaxed)
	!	   1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
	!	   2) Relaxed: anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so
	!		must be mapped to unicode (if possible) or thrown away
	! caveat2: 1) notice that the second parameter is an array of long. We translate from inbound$ into
	!		this array.
	!	   2) This program can be used in conjunction with function unicode_to_utf() which
	!		translates from uni%() to a string
	!	   3) A little bit of external code can be used to convert this array into a string
	! caveat3: A sequence which has the correct UTF-8 bit layout is still rejected (and handled as
	!	   cp1252) when it decodes to any of the following (all forbidden by RFC-3629):
	!	   1) an overlong form (a value that fits in a shorter sequence; eg. C0 80)
	!	   2) a UTF-16 surrogate (D800-DFFF)
	!	   3) a value above 10FFFF
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 107 NSR 200107 1. original effort (derived from MIXED_TO_UNICODE_101.FUN)
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
	!	in the 21-bit address space are being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	!	limits used to validate a decoded value (explicit LONG literals so they never go negative)
	!
	declare long constant utf8_min_2byte = X"80"L				! smallest value needing 2 octets
	declare long constant utf8_min_3byte = X"800"L				! smallest value needing 3 octets
	declare long constant utf8_min_4byte = X"10000"L			! smallest value needing 4 octets
	declare long constant surrogate_low  = X"D800"L				! first UTF-16 surrogate
	declare long constant surrogate_high = X"DFFF"L				! last UTF-16 surrogate
	declare long constant unicode_limit  = X"10FFFF"L			! highest legal code point
	!
	declare string	tst$, alt$, src$					!
	declare long	codepoint%, min_codepoint%, tst%, alt%			!
	declare long	i%, j%, k%, bytes%, count_out%				!
	!=======================================================================
	!	main
	!=======================================================================
	count_out% = 0								! nothing stored yet
	k% = len(inbound$)							! measure the length of inbound data
	src$ = inbound$ + space$(6)						! pad so look-ahead never runs off the end
										! (a space is never a UTF-8 secondary octet)
	for i% = 1 to k%							! scan the string
	    tst$ = mid$(src$, i%, 1)						! isolate tst character
	    tst% = asc(tst$)							! convert to ascii
	    if tst% <= 127 then							! legal ASCII so copy then move on
		count_out% = count_out% + 1					!
		uni%(count_out%) = tst%						! store this byte value as a long
		goto get_next_char						! next iteration
	    end if								!
	    !
	    !	this might be UTF-8 so look for the introductory character
	    !
	    if (tst% and X"e0") = x"c0" then					! test for: 110x-xxxx
		bytes%		= 2						! this might be a 2-byte sequence (or not)
		codepoint%	= tst% and x"1f"				! keep 5-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f0") = x"e0" then					! test for: 1110-xxxx
		bytes%		= 3						! this might be a 3-byte sequence (or not)
		codepoint%	= tst% and x"0f"				! keep 4-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f8") = x"f0" then					! test for: 1111-0xxx
		bytes%		= 4						! this might be a 4-byte sequence (or not)
		codepoint%	= tst% and x"07"				! keep 3-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    !
	    !	definitely not UTF-8 so treat the byte as cp1252
	    !	(see: https://en.wikipedia.org/wiki/Windows-1252)
	    !	a result of zero means "no unicode equivalent" and the byte is dropped
	    !
	    select tst%								! test the original code
		case 128							!
			codepoint% = x'20ac'					! euro sign
		case 129							!
			codepoint% = 0						! BLANK ON THE CHART
		case 130							!
			codepoint% = x'201a'					! single low-9 quotation mark
		case 131							!
			codepoint% = x'0192'					! stylized lower case f (florin)
		case 132							!
			codepoint% = x'201e'					! double low-9 quotation mark
		case 133							!
			codepoint% = x'2026'					! ellipsis
		case 134							!
			codepoint% = x'2020'					! dagger
		case 135							!
			codepoint% = x'2021'					! double dagger
		case 136							!
			codepoint% = x'02c6'					! modifier letter circumflex
		case 137							!
			codepoint% = x'2030'					! per mille sign
		case 138							!
			codepoint% = x'0160'					! upper case S with caron
		case 139							!
			codepoint% = x'2039'					! single left angle quotation mark
		case 140							!
			codepoint% = x'0152'					! upper case OE ligature
		case 141							!
			codepoint% = 0						! BLANK ON THE CHART
		case 142							!
			codepoint% = x'017d'					! upper case Z with caron
		case 143							!
			codepoint% = 0						! BLANK ON THE CHART
		case 144							!
			codepoint% = 0						! BLANK ON THE CHART
		case 145							!
			codepoint% = x'2018'					! left single quotation mark
		case 146							!
			codepoint% = x'2019'					! right single quotation mark
		case 147							!
			codepoint% = x'201c'					! left double quotation mark
		case 148							!
			codepoint% = x'201d'					! right double quotation mark
		case 149							!
			codepoint% = x'2022'					! bullet
		case 150							!
			codepoint% = x'2013'					! en dash
		case 151							!
			codepoint% = x'2014'					! em dash
		case 152							!
			codepoint% = x'02dc'					! small tilde
		case 153							!
			codepoint% = x'2122'					! trade mark sign
		case 154							!
			codepoint% = x'0161'					! lower case s with caron
		case 155							!
			codepoint% = x'203a'					! single right angle quotation mark
		case 156							!
			codepoint% = x'0153'					! lower case oe ligature
		case 157							!
			codepoint% = 0						! BLANK ON THE CHART
		case 158							!
			codepoint% = x'017e'					! lower case z with caron
		case 159							!
			codepoint% = x'0178'					! upper case Y with diaeresis
		case else							!
			codepoint% = tst%					! 160-255: cp1252 matches unicode
	    end select								!
	    if codepoint% > 0 then						! skip the blank chart positions
		count_out% = count_out% + 1					!
		uni%(count_out%) = codepoint%					! store this value
	    end if								!
	    goto get_next_char							! next iteration
	    !
	    !	might be UTF-8 depending upon the following bytes
	    !	entry:	i%		= points to tst$ (first utf-8 octet)
	    !		bytes%		= expected total number of octets (2-4)
	    !		codepoint%	= data bits taken from the first octet
	    !
	    process_uni:							!
	    for j% = 1 to (bytes%-1)						!
		alt$ = mid$(src$, i%+j%, 1)					! isolate character after tst$
		alt% = asc(alt$)						!
		if (alt% and x"c0") <> x"80" then				! not a secondary octet (10xx-xxxx)?
		    goto not_utf8						! then the whole sequence is bogus
		end if								!
		codepoint% = codepoint% * 64%					! shift this by 6 places
		codepoint% = codepoint% + (alt% and x"3f")			! merge the next 6 bits
	    next j%								!
	    !
	    !	the bit layout is okay; now make sure the decoded value is legal (RFC-3629)
	    !
	    select bytes%							!
		case 2								!
			min_codepoint% = utf8_min_2byte				!
		case 3								!
			min_codepoint% = utf8_min_3byte				!
		case else							!
			min_codepoint% = utf8_min_4byte				!
	    end select								!
	    if codepoint% < min_codepoint% then					! overlong form?
		goto not_utf8							!
	    end if								!
	    if codepoint% > unicode_limit then					! beyond the unicode range?
		goto not_utf8							!
	    end if								!
	    if (codepoint% >= surrogate_low) and (codepoint% <= surrogate_high) then	! surrogate?
		goto not_utf8							!
	    end if								!
	    !
	    count_out% = count_out% + 1						!
	    uni%(count_out%) = codepoint%					! is legal UTF-8 so store unicode
	    i% = i% + bytes% - 1						! eat some chars (NEXT will eat one more)
	    goto get_next_char							!
	    !
	    !	not legal UTF-8 after all. Only the first byte is consumed here (the bytes that
	    !	follow are rescanned on their own). It is always in the range 192-247 where
	    !	cp1252 and unicode are identical so it can be stored as-is.
	    !
	    not_utf8:								!
	    count_out% = count_out% + 1						!
	    uni%(count_out%) = tst%						! store original first byte as-is
	    !-------------------------------------------------------------------
	    get_next_char:							!
	next i%									! advance by tst
	uni%(0) = count_out%							! copy count here for safe keeping
	mixed_to_unicode_array = count_out%					! also pass count to the caller
	end function								! adios
	!

left hand Back to OpenVMS
left hand Back to OpenVMS Demo Index
home Back to Home
Neil Rieck
Waterloo, Ontario, Canada.