OpenVMS Source Code Demos

UTF8_DECODE

//================================================================================================
// title   : utf8_decode_xxx.c
// author  : Neil Rieck
// created : 2016-02-23
// notes   :
// 1) this program is my hack to convert inbound web data from UTF-8 to Windows-1252 (a superset
//    of ISO-8859-1)
// 2) I did it this way because we are seeing a lot of malformed data hitting our site and I
//    wanted a solution that would decode legal UTF-8 sequences but just copy everything else.
//    We see this relaxed processing all the time in terminal emulators (this could be disabled
//    if required)
// 3) Version 100 of this program discarded Unicode code points above 255 so in version 101 I
//    added logic which would map code points 256-384 back to ASCII (this could be disabled if
//    required)
// 4) If your emulator will not display certain Windows-1252 characters like the Euro symbol,
//    try setting the emulator code page to "65001" or "windows-1252" or "cp1252" or "ANSI"
//
// ver who when   what
// --- --- ------ --------------------------------------------------------------------------------
// 100 NSR 160223 1. original effort
// 101 NSR 160223 1. added logic to substitute (rather than discard) some codes above 255
// 102 NSR 160308 1. began adding more mappings to table utoa[]
//         160309 1. added more characters to table utoa[]
//		  2. added support for unicode-to-ansi					ver_102.2
//================================================================================================
// UTF-8 encoding
// 1.	RFC 2279: http://www.faqs.org/rfcs/rfc2279.html
// 2.	RFC 3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; only code points up
//			to U+10FFFF of the 21-bit address space are legal (notice the 'z' on line 4))
//
// UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
// -------------------	-----------------------------				---------
// 0000,0000-0000,007F	0xxxxxxx						 7 bits
// 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
// 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
// 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
// 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
// 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
//=================================================================================================
//
//	Build-time configuration. DVLP selects between a stand-alone test program (with diagnostic
//	printf output and a main() further down) and a silent, callable function.
//
#define MAX_OCTETS	4							// longest UTF-8 sequence decoded (supported: 4,5,6)
#define DVLP		1							// develop: 1=program, 0=function
#define RELAXED		1							// 1=accept overlong (illegal) encodings of 7-bit code points and copy them as ASCII
#if (DVLP==0)
void general_decode(char*,char*);						// forward declaration (defined later in this file)
void utf_decode(char *buffer1, char *buffer0) {					// entry point: decode buffer0 into buffer1
	general_decode(buffer1, buffer0);
}
#else
#define	SMALL_STRING	255							// usable characters in each test buffer
char buffer0[SMALL_STRING+1];							// input buffer filled by main() (+1 for the terminator)
char buffer1[SMALL_STRING+1];							// output buffer written by general_decode()
#endif
//
#include <stdio.h>								//
#include <stdlib.h>								//
#include <string.h>								//
//
//	unicode->ansi (last resort mapping)
//
//	1. Rather than discard unicode code points above 255, we will remap some of them to simple ASCII
//	2. I only use the first letter of any ligature
//	3. This table starts at code point 0x0100 (unicode 256)
//
static const unsigned char utoa[] = {
	//
	//	All 8 rows of Latin Extended A (all mapped back to ASCII)
	//	ref: http://www.unicode.org/charts/PDF/U0100.pdf
	//
	'A' ,'a' ,'A' ,'a' ,'A' ,'a' ,'C' ,'c' ,'C' ,'c' ,'C' ,'c' ,'C' ,'c' ,'D' ,'d' , // row 1
	'D' ,'d' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'E' ,'e' ,'G' ,'g' ,'G' ,'g' , // row 2
	'G' ,'g' ,'G' ,'g' ,'H' ,'h' ,'H' ,'h' ,'I' ,'i' ,'I' ,'i' ,'I' ,'i' ,'I' ,'i' , // row 3
	'I' ,'i' ,'I' ,'i' ,'J' ,'j' ,'K' ,'k' ,'k' ,'L' ,'l' ,'L' ,'l' ,'L' ,'l' ,'L' , // row 4
	'l' ,'L' ,'l' ,'N' ,'n' ,'N' ,'n' ,'N' ,'n' ,'n' ,'N' ,'n' ,'O' ,'o' ,'O' ,'o' , // row 5
	'O' ,'o' ,'O' ,'o' ,'R' ,'r' ,'R' ,'r' ,'R' ,'r' ,'S' ,'s' ,'S' ,'s' ,'S' ,'s' , // row 6
	'S' ,'s' ,'T' ,'t' ,'T' ,'t' ,'T' ,'t' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' , // row 7
	'U' ,'u' ,'U' ,'u' ,'W' ,'w' ,'Y' ,'y' ,'Y' ,'Z' ,'z' ,'Z' ,'z' ,'Z' ,'z' ,'f' , // row 8
	//
	//	All 13 rows of Latin Extended B (most mapped back to ASCII; one mapped to ANSI)
	//	ref: http://www.unicode.org/charts/PDF/U0180.pdf
	//
	'b' ,'B' ,'b' ,'b' ,'b' ,'b' ,'C' ,'C' ,'c' ,'D' ,'D' ,'d' ,'d' ,'q' ,'E' ,'e' , // row 1
	'e' ,'F' ,'f' ,'G' ,'V' ,'h' ,'l' ,'I' ,'K' ,'k' ,'l' ,'y' ,'W' ,'N' ,'n' ,'O' , // row 2
	'O' ,'o' ,'D' ,'d' ,'P' ,'p' ,'R' ,'S' ,'s' ,'Z' ,'f' ,'t' ,'T' ,'f' ,'T' ,'U' , // row 3
	'u' ,'U' ,'V' ,'Y' ,'y' ,'Z' ,'z' ,'3' ,'3' ,'3' ,'3' ,'2' ,'5' ,'5' ,'t' ,'p' , // row 4
	'|' ,'|' ,'|' ,'!' ,'D' ,'D' ,'d' ,'L' ,'L' ,'l' ,'N' ,'N' ,'n' ,'A' ,'a' ,'I' , // row 5
	'i' ,'O' ,'o' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'U' ,'u' ,'e' ,'A' ,'a' , // row 6
	'A' ,'a' ,'A' ,'a' ,'G' ,'g' ,'G' ,'g' ,'K' ,'k' ,'Q' ,'q' ,'Q' ,'q' ,'3' ,'3' , // row 7
	'J' ,'D' ,'D' ,'d' ,'G' ,'g' ,'H' ,'P' ,'N' ,'n' ,'A' ,'a' ,'A' ,'a' ,'0' ,'0' , // row 8
	'A' ,'a' ,'A' ,'a' ,'E' ,'e' ,'E' ,'e' ,'I' ,'i' ,'I' ,'i' ,'O' ,'o' ,'O' ,'o' , // row 9
	'R' ,'r' ,'R' ,'r' ,'U' ,'u' ,'U' ,'u' ,'S' ,'s' ,'T' ,'t' ,'3' ,'3' ,'H' ,'h' , // row 10
	'N' ,'d' ,'8' ,'8' ,'Z' ,'z' ,'A' ,'a' ,'E' ,'e' ,'O' ,'o' ,'O' ,'o' ,'O' ,'o' , // row 11
	'O' ,'o' ,'Y' ,'y' ,'l' ,'n' ,'t' ,'J' ,'d' ,'p' ,'A' ,'C' ,0xa2,'L' ,'T' ,'s' , // row 12
	'z' ,'?' ,'c' ,'B' ,'U' ,'A' ,'E' ,'e' ,'J' ,'j' ,'Q' ,'q' ,'R' ,'r' ,'Y' ,'y' , // row 13
	//
	//	All 6 rows of IPA extensions (most mapped back to ASCII; a few mapped back to ANSI)
	//	ref: http://www.unicode.org/charts/PDF/U0250.pdf
	//
	'r' ,'a' ,'a' ,'g' ,'c' ,'c' ,'p' ,'d' ,'e' ,'e' ,'e' ,'3' ,'3' ,'3' ,'B' ,'J' , // row 1
	'g' ,'g' ,'G' ,'V' ,'v' ,'h' ,'h' ,'h' ,'i' ,'l' ,'I' ,'l' ,'l' ,'l' ,'l' ,'m' , // row 2
	'm' ,'m' ,'n' ,'n' ,'N' ,'0' ,'D' ,'w' ,'o' ,'r' ,'r' ,'r' ,'r' ,'r' ,'j' ,'J' , // row 3
	'R' ,'R' ,'S' ,'f' ,'f' ,'f' ,'f' ,'t' ,'t' ,'u' ,'o' ,'v' ,'v' ,'w' ,'y' ,'Y' , // row 4
	'Z' ,'z' ,'3' ,'3' ,'?' ,'?' ,0xbf,'C' ,'O' ,'B' ,'b' ,'G' ,'H' ,'j' ,'k' ,'L' , // row 5
	'q' ,'?' ,'?' ,'d' ,'d' ,'d' ,'t' ,'t' ,'t' ,'f' ,'l' ,'l' ,'w' ,']' ,'h' ,'h'   // row 6
	};
//
//	utoa[] geometry: entry 0 is code point UTOA_BASE and the UTOA_COUNT entries are contiguous
//	(27 rows of 16 = 432 entries, i.e. U+0100 through U+02AF)
//
#define UTOA_BASE	0x0100UL
#define UTOA_COUNT	(sizeof(utoa) / sizeof(utoa[0]))
//
//	classify a lead octet
//
//	lead    : first octet of a candidate UTF-8 sequence
//	payload : receives the data bits carried by the lead octet (0 for a one-octet result)
//	returns : the sequence length announced by the lead octet (2..MAX_OCTETS), or 1 when the octet
//		  is 7-bit ASCII or is not a recognised lead octet (the caller then copies it as-is)
//
static long utf8_lead(unsigned char lead, unsigned long *payload) {
    *payload = 0;
    if (lead <= 0x7F) {								// 0XXX XXXX (7-bit ASCII)
	return 1;
    }
    if ((lead & 0xE0) == 0xC0) {						// 110X XXXX
	*payload = lead & 0x1F;							// isolate 5 bits
	return 2;
    }
    if ((lead & 0xF0) == 0xE0) {						// 1110 XXXX
	*payload = lead & 0x0F;							// isolate 4 bits
	return 3;
    }
    if ((lead & 0xF8) == 0xF0) {						// 1111 0XXX
	*payload = lead & 0x07;							// isolate 3 bits
	return 4;
    }
#if (MAX_OCTETS>=5)								// strict UTF-8 will never use this
    if ((lead & 0xFC) == 0xF8) {						// 1111 10XX
	*payload = lead & 0x03;							// isolate 2 bits
	return 5;
    }
#endif
#if (MAX_OCTETS>=6)								// strict UTF-8 will never use this
    if ((lead & 0xFE) == 0xFC) {						// 1111 110X
	*payload = lead & 0x01;							// isolate 1 bit
	return 6;
    }
#endif
    return 1;									// stray continuation or unsupported lead
}
//
//	map one decoded unicode code point to a single windows-1252 octet
//
//	ref: https://en.wikipedia.org/wiki/Windows-1252#Code_page_layout
//	ref: http://www.unicode.org/charts/PDF/U0000.pdf (C0 Controls and Basic Latin)
//	ref: http://www.unicode.org/charts/PDF/U0080.pdf (C1 Controls and Latin-1 Supplement)
//	ref: http://www.unicode.org/charts/PDF/U20A0.pdf (alternate currency stuff)
//
//	caveat: this assumes the input was legal UNICODE and not windows-1252 which was run through
//		a UTF-8 encoder without being first remapped.
//
static unsigned char unicode_to_ansi(unsigned long uni) {
#if (RELAXED==1)
    if (uni <= 127) {								// overlong encoding (illegal; security risk)
	return (unsigned char) uni;						// but copy as ascii anyway
    }
#endif
    if (uni < 160) {								// control code (or overlong when not RELAXED)
	return 'X';								// replace with an 'X'
    }
    if (uni <= 255) {								// Latin-1 Supplement
	return (unsigned char) uni;						// identical in iso-8859-1 and windows-1252
    }
    switch (uni) {								// windows-1252 octets 128-159	ver_102.2
    case 0x20ac: return 128;							// euro sign
    case 0x201a: return 130;
    case 0x0192: return 131;
    case 0x201e: return 132;
    case 0x2026: return 133;
    case 0x2020: return 134;
    case 0x2021: return 135;
    case 0x02c6: return 136;
    case 0x2030: return 137;
    case 0x0160: return 138;
    case 0x2039: return 139;
    case 0x0152: return 140;
    case 0x017d: return 142;
    case 0x2018: return 145;
    case 0x2019: return 146;
    case 0x201c: return 147;
    case 0x201d: return 148;
    case 0x2022: return 149;
    case 0x2013: return 150;
    case 0x2014: return 151;
    case 0x02dc: return 152;
    case 0x2122: return 153;
    case 0x0161: return 154;
    case 0x203a: return 155;
    case 0x0153: return 156;
    case 0x017e: return 158;
    case 0x0178: return 159;
    //
    //	special unicode mapping
    //
    case 0x20a4: return 0xa3;							// lira sign -> pound sign (0xa4 is the generic currency sign)
    default:
	break;
    }
    if ((uni - UTOA_BASE) < UTOA_COUNT) {					// strictly inside utoa[] ('<=' would read one past the end)
	return utoa[uni - UTOA_BASE];						// last resort remap
    }
    return 'Z';									// nothing suitable so replace with a 'Z'
}
//
//	general decode buffer0 while copying to buffer1
//
//	buffer0 : NUL-terminated input; legal UTF-8 sequences are decoded, everything else is copied
//		  octet-for-octet (relaxed processing, like a terminal emulator)
//	buffer1 : receives the NUL-terminated windows-1252 result. Every input sequence produces
//		  exactly one output octet and every other octet is copied once, so the output never
//		  exceeds the input; the caller must supply at least strlen(buffer0)+1 bytes.
//
void general_decode(char *buffer1, char *buffer0) {
    long s0 = 0;								// subscript into buffer0
    long s1 = 0;								// subscript into buffer1
    long remain = (long) strlen(buffer0);					// octets not yet consumed
    //--------------------------------------------------------------------------
#if (DVLP==1)
    printf("-i-input    : %s\n", buffer0);					//
    printf("-i-length   : %ld\n", remain);					//
#endif
    buffer1[s1] = '\0';								// init output buf
    while (buffer0[s0] != '\0') {						//
	unsigned long uni;							// large enough for UCS4
	long bytes = utf8_lead((unsigned char) buffer0[s0], &uni);		//
	int decoded = 0;							// 1 = "uni" holds a complete code point
	//
	//	optional multi-byte processing
	//	note: next byte(s) must be coded as 10xx xxxx or the encoding can't be called utf-8
	//	      s0 has not yet moved so start using offset 1
	//
	if ((bytes > 1) && (bytes <= remain)) {					// the whole sequence must fit in the input
	    decoded = 1;							//
	    for (long i = 1; i < bytes; i++) {					//
		unsigned char ch = (unsigned char) buffer0[s0 + i];		// sample a character (speculative)
		if ((ch & 0xC0) != 0x80) {					// not 10XX XXXX
		    decoded = 0;						// so default to copy as-is
		    break;							//
		}
		uni = (uni << 6) | (ch & 0x3FUL);				// shift by 6 places and merge 6 bits
	    }
	}
#if (RELAXED==1)
	if (decoded && (uni == 0)) {						// overlong NUL would truncate the output string
	    decoded = 0;							// so copy its octets as-is instead
	}
#endif
	if (decoded) {
	    //
	    //	"uni" is the code point but there is only room for one octet in windows-1252
	    //	so map what we can and substitute the rest
	    //
	    unsigned char ansi;
#if (DVLP==1)
	    printf("-i-unicode  : %lu\n", uni);					//
#endif
	    ansi = unicode_to_ansi(uni);					//
	    buffer1[s1++] = (char) ansi;					//
	    buffer1[s1  ] = '\0';						// and terminate
	    s0 = s0 + bytes;							// advance source pointer by bytes
#if (DVLP==1)
	    printf("-i-ansi     : %u\n", (unsigned) ansi);			//
#endif
	}else{
	    bytes = 1;								// force one-byte copy as-is
	    buffer1[s1++] = buffer0[s0++];					// copy
	    buffer1[s1  ] = '\0';						// and terminate
	}
	remain = remain - bytes;						// update remaining count
    }
#if (DVLP==1)
    if (remain!=0) {
        printf("-e-final value for remain: %ld\n",remain);
    }
    printf("-i-output   : %s\n", buffer1);
    printf("-i-length   : %lu\n", (unsigned long) strlen(buffer1));
    printf("----------------------------------------\n");
#endif
}
#if (DVLP==1)
//==============================================================================
//	main()
//==============================================================================
//
//	run one test case: announce it, load the input into buffer0 and decode into buffer1
//
//	title : text printed after "-i-test case: "
//	input : raw octets to decode (truncated to fit buffer0 rather than overflowing it)
//
static void run_case(const char *title, const char *input) {
	printf("-i-test case: %s\n", title);					//
	snprintf(buffer0, sizeof buffer0, "%s", input);				// bounded copy, always terminated
	general_decode(buffer1, buffer0);					// prints its own diagnostics when DVLP==1
}
//
//	test driver (only built when DVLP==1)
//	note: each non-ASCII octet sits in its own string literal so that a hex escape can never
//	      swallow a following hex digit
//
int main(void) {
	run_case("ASCII only",
		 "this is a test");
	run_case("ISO-8859-1 (e acute)",
		 "this is a t" "\xE9" "st");
	run_case("UTF-8 (e acute)",
		 "this is a t" "\xC3\xA9" "st");
	run_case("mixed (two e acute; one ISO and one UTF-8; probably illegal)",
		 "this is a t" "\xE9" "\xC3\xA9" "st");
	run_case("UTF-8 on last char of string (boundary check)",
		 "inverted question: " "\xC2\xBF");
	//
	//	table: latin-a
	//
	run_case("UTF-8> 0xC4,0x80 = 0x100",
		 "should map to 'A': " "\xC4\x80");
	run_case("UTF-8> 0xC5,0xbf = 0x17f",
		 "should map to 'f': " "\xC5\xBF");
	//
	//	table: latin-b
	//
	run_case("UTF-8> 0xC6,0x80 = 0x180",
		 "should map to 'b': " "\xC6\x80");
	run_case("UTF-8> 0xC9,0x8f = 0x24f",
		 "should map to 'y': " "\xC9\x8F");
	//
	//	special case(s)
	//
	run_case("UTF-8> 0xE2, 0x82, 0xAC = Euro Symbol",
		 "should map to euro symbol: " "\xE2\x82\xAC");
	return EXIT_SUCCESS;
}
#endif