Add GB18030 support. Contributed by Bill Huang <bill_huanghb@ybb.ne.jp>

(ODBC support has not been committed yet. left for Hiroshi...)

Add GB18030 support. Contributed by Bill Huang <bill_huanghb@ybb.ne.jp>
(ODBC support has not been committed yet. left for Hiroshi...)
14f72b9a · Tatsuo Ishii · 620dbc98 · 14f72b9a · 14f72b9a · 14f72b9a
Commit 14f72b9a authored Jun 13, 2002 by Tatsuo Ishii
8 changed files
--- a/src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
+++ b/src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+#! /usr/bin/perl
+#
+# Copyright 2002 by Bill Huang
+#
+# $Id: UCS_to_GB18030.pl,v 1.1 2002/06/13 08:28:55 ishii Exp $
+#
+# Generate UTF-8 <--> GB18030 code conversion tables from
+# map files provided by Unicode organization.
+# Unfortunately it is prohibited by the organization
+# to distribute the map files. So if you try to use this script,
+# you have to obtain ISO10646-GB18030.TXT from
+# the organization's ftp site.
+#
+# ISO10646-GB18030.TXT format, as parsed below (whitespace separated):
+#		 first field:  UCS code in hex
+#		 second field: GB18030 code in hex
+#		 anything after that (e.g. "# name") is ignored
+# Lines beginning with '#' are skipped.
+#
+# NOTE(review): an earlier version of this header listed the GB18030
+# column first; the parsing code has always taken the first field as the
+# UCS side. Confirm the column order against the real map file.
+# NOTE(review): ucs2utf.pl is loaded but nothing in this script calls it,
+# so the "UTF" side of both tables is the raw hex value of the first
+# field. Confirm whether a UCS -> UTF-8 conversion was intended.
+require "ucs2utf.pl";
+#
+# write_map( FILE, TYPE, NAME, MAPREF )
+#
+# Write the hash referenced by MAPREF to FILE as a C array
+# "static TYPE NAME[ n ]" with one {key, value} pair per entry, sorted
+# numerically by key. The array size and the comma handling are derived
+# from the number of distinct keys, so duplicate input lines cannot leave
+# a wrong size or a trailing comma behind.
+#
+sub write_map {
+	my( $file, $type, $name, $map ) = @_;
+	my $remaining = scalar( keys( %$map ) );
+	open( FILE, "> $file" ) || die( "cannot open $file" );
+	printf FILE "static %s %s[ %d ] = {\n", $type, $name, $remaining;
+	for my $index ( sort {$a <=> $b} keys( %$map ) ){
+		$remaining--;
+		# every entry except the last one is followed by a comma
+		printf FILE "  {0x%04x, 0x%04x}%s\n",
+			$index, $map->{ $index }, ( $remaining == 0 ) ? "" : ",";
+	}
+	print FILE "};\n";
+	close( FILE ) || die( "cannot close $file" );
+}
+#
+# read the map file once and build both directions
+#
+$in_file = "ISO10646-GB18030.TXT";
+%utf_to_code = ();
+%code_to_utf = ();
+open( FILE, $in_file ) || die( "cannot open $in_file" );
+while( <FILE> ){
+	# chomp only strips a trailing newline; chop would eat a hex digit
+	# from a final line that has no newline
+	chomp;
+	if( /^#/ ){
+		next;
+	}
+	( $u, $c, $rest ) = split;
+	# blank or short lines would otherwise become a bogus {0, 0} entry
+	next unless( defined( $u ) && defined( $c ) );
+	$utf = hex($u);
+	$code = hex($c);
+	# as before, a later line overrides an earlier one with the same key
+	$utf_to_code{ $utf } = $code;
+	$code_to_utf{ $code } = $utf;
+}
+close( FILE );
+#
+# first, generate UTF8 --> GB18030 table
+#
+write_map( "utf8_to_gb18030.map", "pg_utf_to_local", "ULmapGB18030",
+	\%utf_to_code );
+#
+# then generate GB18030 --> UTF8 table
+#
+write_map( "gb18030_to_utf8.map", "pg_local_to_utf", "LUmapGB18030",
+	\%code_to_utf );
--- a/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
--- a/src/backend/utils/mb/Unicode/utf8_to_gb18030.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_gb18030.map
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -6,7 +6,7 @@
 * WIN1250 client encoding support contributed by Pavel Behal
 * SJIS UDC (NEC selection IBM kanji) support contributed by Eiji Tokuya
 *
- * $Id: conv.c,v 1.37 2002/03/06 06:10:26 momjian Exp $
+ * $Id: conv.c,v 1.38 2002/06/13 08:28:54 ishii Exp $
 *
 *
 */
@@ -48,6 +48,8 @@
 #include "Unicode/euc_jp_to_utf8.map"
 #include "Unicode/utf8_to_euc_cn.map"
 #include "Unicode/euc_cn_to_utf8.map"
+#include "Unicode/utf8_to_gb18030.map"
+#include "Unicode/gb18030_to_utf8.map"
 #include "Unicode/utf8_to_euc_kr.map"
 #include "Unicode/euc_kr_to_utf8.map"
 #include "Unicode/utf8_to_euc_tw.map"
@@ -515,6 +517,96 @@ mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
 	*p = '\0';
 }
+/*
+ * GB18030 ---> MIC
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ *
+ * Copies the GB18030 text at gb18030 to p byte for byte and terminates
+ * the output with '\0'. len is the number of input bytes; scanning also
+ * stops at the first '\0' byte.
+ *
+ *	- bytes below 0x80 are copied as single-byte characters
+ *	- lead byte 0x81..0xfe followed by 0x30..0x39 starts a four-byte
+ *	  character; exactly four bytes are copied
+ *	- lead byte 0x81..0xfe followed by 0x40..0x7e or 0x80..0xfe is a
+ *	  two-byte character; exactly two bytes are copied
+ *	- a lead byte followed by anything else, and the bytes 0x80 and 0xff,
+ *	  are dropped. Only the offending byte is dropped, so the byte after
+ *	  it (which may be the terminating '\0') is examined again.
+ *	- a multi-byte character cut short by len or by '\0' ends the scan
+ */
+static void
+gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
+{
+	int			c1;
+	int			c2;
+	while (len > 0 && (c1 = *gb18030++))
+	{
+		if (c1 < 0x80)
+		{						/* should be ASCII */
+			len--;
+			*p++ = c1;
+		}
+		else if (c1 >= 0x81 && c1 <= 0xfe)
+		{
+			/* only peek: an invalid trail byte must not be consumed */
+			c2 = *gb18030;
+			if (c2 >= 0x30 && c2 <= 0x39)
+			{
+				/* four-byte character; make sure all of it is there */
+				if (len < 4 || gb18030[1] == '\0' || gb18030[2] == '\0')
+					break;
+				len -= 4;
+				*p++ = c1;
+				*p++ = *gb18030++;
+				*p++ = *gb18030++;
+				*p++ = *gb18030++;
+			}
+			else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
+			{
+				/* two-byte character */
+				if (len < 2)
+					break;
+				len -= 2;
+				*p++ = c1;
+				*p++ = *gb18030++;
+			}
+			else
+			{					/* throw the strange lead byte away */
+				len--;
+			}
+		}
+		else
+		{						/* 0x80 and 0xff never start a character */
+			len--;
+		}
+	}
+	*p = '\0';
+}
+/*
+ * MIC ---> GB18030
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ *
+ * Walks the text at mic until len is used up or a '\0' byte is seen and
+ * writes the result, terminated with '\0', to p.
+ *
+ *	- a byte <= 0x7f is copied as is
+ *	- a byte in 0x81..0xfe followed by 0x40..0x7e or 0x80..0xfe is copied
+ *	  as a two-byte character
+ *	- a byte in 0x81..0xfe followed by 0x30..0x39 is copied together with
+ *	  the next two bytes as a four-byte character
+ *	- for any other second byte, mic is stepped back by one and
+ *	  printBogusChar() is called, twice in a row
+ *	- for any other first byte, mic is stepped back by one and
+ *	  printBogusChar() is called once
+ *
+ * NOTE(review): len is reduced by pg_mic_mblen() of the first byte, while
+ * the number of bytes actually consumed is decided by the GB18030
+ * second-byte test below. Confirm that the two agree for GB18030 input.
+ * NOTE(review): printBogusChar() is not visible here. If it advances mic
+ * by one byte, the two calls in the bogus-second-byte branch report the
+ * second byte twice and never the first one - confirm the intent.
+ * NOTE(review): the four-byte branch reads two more bytes without
+ * checking them for '\0' - confirm callers never pass truncated input.
+ */
+static void
+mic2gb18030(unsigned char *mic, unsigned char *p, int len)
+{
+	int			c1;
+	int			c2;
+	while (len > 0 && (c1 = *mic))
+	{
+		len -= pg_mic_mblen(mic++);	/* mic now points past the first byte */
+		if (c1 <= 0x7f) /*ASCII*/
+		{					
+			*p++ = c1;
+		}
+		else if (c1 >= 0x81 && c1 <= 0xfe)
+		{		
+			c2 = *mic++;
+			if((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)){
+				*p++ = c1;
+				*p++ = c2;
+			}
+			else if(c2 >= 0x30 && c2 <= 0x39){
+				*p++ = c1;
+				*p++ = c2;
+				*p++ = *mic++;
+				*p++ = *mic++;
+			}	
+			else{
+				mic--;		/* back to the second byte */
+				printBogusChar(&mic, &p);
+				mic--;
+				printBogusChar(&mic, &p);
+			}		
+		}
+		else{
+			mic--;			/* back to the first byte */
+			printBogusChar(&mic, &p);
+		}
+	}
+	*p = '\0';
+}
 /*
 * EUC_TW ---> MIC
 */
@@ -1596,6 +1688,26 @@ euc_cn_to_utf(unsigned char *euc, unsigned char *utf, int len)
 		  sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf), PG_EUC_CN, len);
 }
+/*
+ * UTF-8 ---> GB18030
+ *
+ * Passes utf, euc and len on to utf_to_local() together with the
+ * ULmapGB18030 table and the number of entries in it.
+ */
+static void
+utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len)
+{
+	int			map_entries = sizeof(ULmapGB18030) / sizeof(pg_utf_to_local);
+	utf_to_local(utf, euc, ULmapGB18030, map_entries, len);
+}
+/*
+ * GB18030 ---> UTF-8
+ *
+ * Passes euc, utf and len on to local_to_utf() together with the
+ * LUmapGB18030 table, the number of entries in it and PG_GB18030.
+ */
+static void
+gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len)
+{
+	int			map_entries = sizeof(LUmapGB18030) / sizeof(pg_local_to_utf);
+	local_to_utf(euc, utf, LUmapGB18030, map_entries, PG_GB18030, len);
+}
 /*
 * UTF-8 ---> EUC_KR
 */
@@ -1935,6 +2047,9 @@ pg_enconv	pg_enconv_tbl[] =
 	{
 		PG_WIN1250, win12502mic, mic2win1250, win1250_to_utf, utf_to_win1250
 	},
+	{
+		PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030
+	},
 };
 #else
@@ -2019,9 +2134,18 @@ pg_enconv	pg_enconv_tbl[] =
 	{
 		PG_BIG5, big52mic, mic2big5, 0, 0
 	},
+	{
+		PG_GBK, 0, 0, 0, 0
+	},
+	{
+		PG_UHC, 0, 0, 0, 0
+	},
 	{
 		PG_WIN1250, win12502mic, mic2win1250, 0, 0
 	},
+	{
+		PG_GB18030, gb180302mic, mic2gb18030, 0, 0
+	},
 };
 #endif   /* UNICODE_CONVERSION */
--- a/src/backend/utils/mb/encnames.c
+++ b/src/backend/utils/mb/encnames.c
@@ -2,7 +2,7 @@
 * Encoding names and routines for working with them. Everything
 * in this file is shared between FE and BE.
 *
- * $Id: encnames.c,v 1.7 2002/03/05 05:52:44 momjian Exp $
+ * $Id: encnames.c,v 1.8 2002/06/13 08:28:54 ishii Exp $
 */
 #ifdef FRONTEND
 #include "postgres_fe.h"
@@ -60,7 +60,11 @@ pg_encname	pg_encname_tbl[] =
 	{
 		"euctw", PG_EUC_TW
 	},							/* EUC-TW; Extended Unix Code for
 								 * traditional Chinese */
+	{
+		"gb18030", PG_GB18030
+	},							/* GB18030;GB18030 */
 	{
 		"gbk", PG_GBK
 	},							/* GBK; Chinese Windows CodePage 936
@@ -239,7 +243,6 @@ pg_encname	pg_encname_tbl[] =
 	{
 		"windows950", PG_BIG5
 	},							/* alias for BIG5 */
 	{
 		NULL, 0
 	}							/* last */
@@ -353,6 +356,9 @@ pg_enc2name pg_enc2name_tbl[] =
 	},
 	{
 		"WIN1250", PG_WIN1250
+	},
+	{
+		"GB18030", PG_GB18030
 	}
 };

--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
 /*
 * conversion functions between pg_wchar and multi-byte streams.
 * Tatsuo Ishii
- * $Id: wchar.c,v 1.27 2002/03/05 05:52:44 momjian Exp $
+ * $Id: wchar.c,v 1.28 2002/06/13 08:28:54 ishii Exp $
 *
 * WIN1250 client encoding updated by Pavel Behal
 *
@@ -510,6 +510,31 @@ pg_uhc_mblen(const unsigned char *s)
 	return (len);
 }
+/*
+ * GB18030
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ *
+ * Returns the byte length of the character starting at s:
+ *	1 when the first byte is <= 0x7f (ASCII),
+ *	4 when the second byte is in 0x30..0x39,
+ *	2 for every other second byte.
+ * The second byte is only read when the first one is not ASCII.
+ *
+ * NOTE(review): this can return 4, so any per-encoding "maximum character
+ * length" recorded for GB18030 elsewhere should presumably be 4 - verify.
+ */
+static int
+pg_gb18030_mblen(const unsigned char *s)
+{
+        unsigned char           second;
+        if (*s <= 0x7f)
+                return 1;                               /* ASCII */
+        second = s[1];
+        if (second >= 0x30 && second <= 0x39)
+                return 4;                               /* four-byte form */
+        return 2;                                       /* two-byte form */
+}
 pg_wchar_tbl pg_wchar_table[] = {
 	{pg_ascii2wchar_with_len, pg_ascii_mblen, 1},		/* 0; PG_SQL_ASCII	*/
 	{pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3},		/* 1; PG_EUC_JP */
@@ -544,6 +569,7 @@ pg_wchar_tbl pg_wchar_table[] = {
 	{0, pg_gbk_mblen, 2},		/* 30; PG_GBK */
 	{0, pg_uhc_mblen, 2},		/* 31; PG_UHC */
 	{pg_latin12wchar_with_len, pg_latin1_mblen, 1},		/* 32; PG_WIN1250 */
+	{0, pg_gb18030_mblen, 2}       /* 33; PG_GB18030 */
 };
 /* returns the byte length of a word for mule internal code */

--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
-/* $Id: pg_wchar.h,v 1.38 2002/03/05 05:52:50 momjian Exp $ */
+/* $Id: pg_wchar.h,v 1.39 2002/06/13 08:30:22 ishii Exp $ */
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
@@ -189,7 +189,7 @@ typedef enum pg_enc
 	PG_GBK,					/* GBK (Windows-936) */
 	PG_UHC,					/* UHC (Windows-949) */
 	PG_WIN1250,					/* windows-1250 */
+	PG_GB18030,					/* GB18030 */
 	_PG_LAST_ENCODING_			/* mark only */
 } pg_enc;