OpenVMS Source Code Demos

UTF8_ENCODE

//================================================================================================
// title   : utf8_encode_xxx.c
// author  : Neil Rieck
// created : 2016-03-09
// notes   :
// 1) This program is my hack to convert outbound web data from Windows-1252 (a superset of
//    ISO-8859-1) to UTF-8
// 2) To properly see output data, adjust your terminal emulator to display UTF-8 (this will cause
//    certain input characters to be undisplayable)
//
// ver who when   what
// --- --- ------ --------------------------------------------------------------------------------
// 100 NSR 160309 1. original effort
//================================================================================================
// UTF-8 encoding
// 1.	RFC 2279: http://www.faqs.org/rfcs/rfc2279.html
// 2.	RFC 3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets and to code points
//			no higher than U+10FFFF, so only part of the 21-bit space is valid; notice the 'z'
//			on line 4, which is set only for code points U+100000 and above)
//
// UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
// -------------------	-----------------------------				---------
// 0000,0000-0000,007F	0xxxxxxx						 7 bits
// 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
// 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
// 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
// 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
// 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
//=================================================================================================
//
//	build mode selector
//	DVLP==1 : stand-alone test program (main() plus two file-scope test buffers)
//	DVLP==0 : callable function only (utf_encode() is the entry point)
//	The default is 1; it may be overridden from the compiler command line, so the
//	source no longer has to be edited to build the function variant.
//
#ifndef DVLP
#define DVLP		1							// develop: 1=program, 0=function
#endif
#if (DVLP==0)
void general_encode(char*, char*);						// forward declaration (defined later in this file)
//
//	utf_encode: entry point for the function build
//	forwards both arguments, unchanged, to general_encode()
//	buffer1 = destination, buffer0 = source (same order as general_encode)
//
void utf_encode(char *buffer1, char *buffer0) {					// function definition
	general_encode(buffer1, buffer0);
}
#else
#define	SMALL_STRING	255							// test buffer capacity (excluding terminator)
char buffer0[SMALL_STRING+1];							// test input  (filled by main)
char buffer1[SMALL_STRING+1];							// test output (filled by general_encode)
#endif
//
#include <stdio.h>								//
#include <stdlib.h>								//
#include <string.h>								//
//
//	ansi-to-unicode
//	1. this table is used to map 32-chars from ansi (windows-1252) to unicode values
//	   (these 32 characters represent the difference between iso-8859-1 and ansi (windows-1252))
//	2. the first entry here represents 0x80 (128)
//	3. decimal values represent holes in the mapping (better to copy data than discard data)
//	4. references
//		ref: https://en.wikipedia.org/wiki/Windows-1252#Code_page_layout
//		ref: http://www.unicode.org/charts/PDF/U0000.pdf (C0 Controls and Basic Latin)
//		ref: http://www.unicode.org/charts/PDF/U0080.pdf (C1 Controls and Latin-1 Supplement)
//		ref: http://www.unicode.org/charts/PDF/U20A0.pdf (alternate currency stuff)
//
static const unsigned long atou[] = {
	0x20ac,   129,0x201a,0x0192,0x201e,0x2026,0x2020,0x2021,0x02c6,0x2030,0x0160,0x2039,0x0152,   141,0x017d,   143,	// row 1: 0x80-0x8f
	   144,0x2018,0x2019,0x201c,0x201d,0x2022,0x2013,0x2014,0x02dc,0x2122,0x0161,0x203a,0x0153,   157,0x017e,0x0178	// row 2: 0x90-0x9f
	};
//
//	general_encode: convert a Windows-1252 string to UTF-8
//
//	buffer1	destination; receives the NUL-terminated UTF-8 result
//	buffer0	source; NUL-terminated Windows-1252 text (not modified)
//
//	Bytes 0x00-0x7f and 0xa0-0xff are used directly as the code point; bytes
//	0x80-0x9f are translated through atou[]. Every code point this produces is
//	below 0x10000, so each input byte yields at most 3 output bytes. No length is
//	passed in, so the caller must supply at least 3*strlen(buffer0)+1 bytes.
//
//	Code points that cannot be encoded (surrogates, or anything above U+10FFFF)
//	produce no output; the current table never generates one.
//
void general_encode(char *buffer1, char *buffer0) {
    unsigned char ch;								// current input byte
    unsigned long uni;								// large enough for UCS4
    char *dst = buffer1;							// write cursor
    char *src = buffer0;							// read cursor
    //--------------------------------------------------------------------------
#if (DVLP==1)
    printf("-i-input    : %s\n", buffer0);
    printf("-i-length   : %lu\n", (unsigned long) strlen(buffer0));		// size_t is not int: cast to match the format
#endif
    while (*src != '\0') {
        ch = (unsigned char) *src++;						// unsigned so 0x80-0xff never go negative
        if ((ch <= 0x7F) || (ch >= 0xa0)) {					// 7-bit ASCII or 8-bit ISO-8859-1
            uni = ch;								// character code is the unicode value
        } else {
            uni = atou[ch - 0x80];						// ANSI (windows-1252) specials
        }
        //
        //	convert unicode to utf-8
        //
        if (uni < 0x80) {							// 1 octet
            *dst++ = (char) uni;
        } else if (uni < 0x800) {						// 2 octets
            *dst++ = (char) (0xC0 | (uni >> 6));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        } else if ((uni >= 0xD800) && (uni <= 0xDFFF)) {
            // surrogate range: unsupported (see RFC 3629), emit nothing
        } else if (uni < 0x10000) {						// 3 octets
            *dst++ = (char) (0xE0 | (uni >> 12));
            *dst++ = (char) (0x80 | ((uni >> 6) & 0x3F));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        } else if (uni < 0x110000) {						// 4 octets
            *dst++ = (char) (0xF0 | (uni >> 18));
            *dst++ = (char) (0x80 | ((uni >> 12) & 0x3F));
            *dst++ = (char) (0x80 | ((uni >> 6) & 0x3F));
            *dst++ = (char) (0x80 | (uni & 0x3F));
        }
        // anything above U+10FFFF is unsupported (see RFC 3629): emit nothing
#if (DVLP==1)
        if (uni >= 128) {
            printf("-i-ansi     : %s%x\n", "0x", (unsigned int) ch);
            printf("-i-unicode  : %s%lx\n", "0x", uni);				// uni is unsigned long
        }
#endif
    }
    *dst = '\0';								// terminate exactly at the end of the output
#if (DVLP==1)
    printf("-i-output   : %s\n", buffer1);
    printf("-i-length   : %lu\n", (unsigned long) strlen(buffer1));
    printf("----------------------------------------\n");
#endif
}
#if (DVLP==1)
//==============================================================================
//	main()
//==============================================================================
//
//	test driver (DVLP==1 only): fills buffer0 with three sample strings and runs
//	each through general_encode(), which prints the input, the output and lengths
//	returns EXIT_SUCCESS (standard C requires main to return int)
//
int main(void){
	printf("-i-test case: ASCII only\n");					//
	sprintf(buffer0,"this is a test");					// plain 7-bit text
	general_encode(buffer1,buffer0);					//
	//
	printf("-i-test case: ISO-8859-1 (e acute)\n");				//
	sprintf(buffer0,"%s%c%s"	,"this is a t",0xE9,	"st");		// 0xE9 = e acute
	general_encode(buffer1,buffer0);					//
	//
	printf("-i-test case: ANSI (Euro symbol)\n");				//
	sprintf(buffer0,"%s%c"	,"Euro Symbol: ",0x80);				// 0x80 = Euro sign in windows-1252
	general_encode(buffer1,buffer0);					//
	//
	return EXIT_SUCCESS;							// report success to the caller
}
#endif