Extend GB18030 encoding conversion to cover full Unicode range.

Our previous code for GB18030 <-> UTF8 conversion only covered Unicode code points up to U+FFFF, but the actual spec defines conversions for all code points up to U+10FFFF. That would be rather impractical as a lookup table, but fortunately there is a simple algorithmic conversion between the additional code points and the equivalent GB18030 byte patterns. Make use of the just-added callback facility in LocalToUtf/UtfToLocal to perform the additional conversions. Having created the infrastructure to do that, we can use the same code to map certain linearly-related subranges of the Unicode space below U+FFFF, allowing removal of the corresponding lookup table entries. This more than halves the lookup table size, which is a substantial savings; utf8_and_gb18030.so drops from nearly a megabyte to about half that. In support of doing that, replace ISO10646-GB18030.TXT with the data file gb-18030-2000.xml (retrieved from http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ ) in which these subranges have been deleted from the simple lookup entries. Per bug #12845 from Arjen Nienhuis. The conversion code added here is based on his proposed patch, though I whacked it around rather heavily.

Extend GB18030 encoding conversion to cover full Unicode range.
Our previous code for GB18030 <-> UTF8 conversion only covered Unicode code points up to U+FFFF, but the actual spec defines conversions for all code points up to U+10FFFF. That would be rather impractical as a lookup table, but fortunately there is a simple algorithmic conversion between the additional code points and the equivalent GB18030 byte patterns. Make use of the just-added callback facility in LocalToUtf/UtfToLocal to perform the additional conversions. Having created the infrastructure to do that, we can use the same code to map certain linearly-related subranges of the Unicode space below U+FFFF, allowing removal of the corresponding lookup table entries. This more than halves the lookup table size, which is a substantial savings; utf8_and_gb18030.so drops from nearly a megabyte to about half that. In support of doing that, replace ISO10646-GB18030.TXT with the data file gb-18030-2000.xml (retrieved from http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ ) in which these subranges have been deleted from the simple lookup entries. Per bug #12845 from Arjen Nienhuis. The conversion code added here is based on his proposed patch, though I whacked it around rather heavily.
8d3e0906 · Tom Lane · 92edba26 · 92edba26 · 8d3e0906 · 8d3e0906
Commit 8d3e0906 authored May 15, 2015 by Tom Lane
7 changed files
--- a/src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
+++ b/src/backend/utils/mb/Unicode/ISO10646-GB18030.TXT
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -86,14 +86,14 @@ euc_tw_to_utf8.map utf8_to_euc_tw.map : CNS11643.TXT
 sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT
 	$(PERL) $(srcdir)/UCS_to_SJIS.pl
-gb18030_to_utf8.map  utf8_to_gb18030.map : ISO10646-GB18030.TXT
+gb18030_to_utf8.map  utf8_to_gb18030.map : gb-18030-2000.xml
 	$(PERL) $(srcdir)/UCS_to_GB18030.pl
 big5_to_utf8.map  utf8_to_big5.map : BIG5.TXT CP950.TXT
 	$(PERL) $(srcdir)/UCS_to_BIG5.pl
-clean:
-	rm -f $(MAPS)
 distclean: clean
 	rm -f $(TEXTS)
+maintainer-clean: distclean
+	rm -f $(MAPS)
--- a/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
@@ -5,42 +5,46 @@
 # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
 #
 # Generate UTF-8 <--> GB18030 code conversion tables from
-# "ISO10646-GB18030.TXT"
+# "gb-18030-2000.xml"
 #
-# file format:
+# The lines we care about in the source file look like
-#		GB18030 hex code
+#    <a u="009A" b="81 30 83 36"/>
-#		UCS-2 hex code
+# where the "u" field is the Unicode code point in hex,
+# and the "b" field is the hex byte sequence for GB18030
 require "ucs2utf.pl";
-# first generate UTF-8 --> GB18030 table
+# Read the input
-$in_file = "ISO10646-GB18030.TXT";
+$in_file = "gb-18030-2000.xml";
 open(FILE, $in_file) || die("cannot open $in_file");
 while (<FILE>)
 {
-	chop;
+	next if (! m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
-	if (/^#/)
+	$u = $1;
-	{
+	$c = $2;
-		next;
+	$c =~ s/ //g;
-	}
-	($u, $c, $rest) = split;
 	$ucs  = hex($u);
 	$code = hex($c);
 	if ($code >= 0x80 && $ucs >= 0x0080)
 	{
 		$utf = &ucs2utf($ucs);
-		if ($array{$utf} ne "")
+		if ($arrayu{$utf} ne "")
 		{
 			printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
 			next;
 		}
+		if ($arrayc{$code} ne "")
+		{
+			printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
+			next;
+		}
+		$arrayu{$utf} = $code;
+		$arrayc{$code} = $utf;
 		$count++;
-		$array{$utf} = $code;
 	}
 }
 close(FILE);
@@ -54,11 +58,12 @@ $file = "utf8_to_gb18030.map";
 open(FILE, "> $file") || die("cannot open $file");
 print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
+$cc = $count;
+for $index (sort { $a <=> $b } keys(%arrayu))
 {
-	$code = $array{$index};
+	$code = $arrayu{$index};
-	$count--;
+	$cc--;
-	if ($count == 0)
+	if ($cc == 0)
 	{
 		printf FILE "  {0x%04x, 0x%04x}\n", $index, $code;
 	}
@@ -75,43 +80,17 @@ close(FILE);
 #
 # then generate GB18030 --> UTF8 table
 #
-reset 'array';
-open(FILE, $in_file) || die("cannot open $in_file");
-while (<FILE>)
-{
-	chop;
-	if (/^#/)
-	{
-		next;
-	}
-	($u, $c, $rest) = split;
-	$ucs  = hex($u);
-	$code = hex($c);
-	if ($code >= 0x80 && $ucs >= 0x0080)
-	{
-		$utf = &ucs2utf($ucs);
-		if ($array{$code} ne "")
-		{
-			printf STDERR "Warning: duplicate code: %04x\n", $ucs;
-			next;
-		}
-		$count++;
-		$array{$code} = $utf;
-	}
-}
-close(FILE);
 $file = "gb18030_to_utf8.map";
 open(FILE, "> $file") || die("cannot open $file");
 print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
-for $index (sort { $a <=> $b } keys(%array))
+$cc = $count;
+for $index (sort { $a <=> $b } keys(%arrayc))
 {
-	$utf = $array{$index};
+	$utf = $arrayc{$index};
-	$count--;
+	$cc--;
-	if ($count == 0)
+	if ($cc == 0)
 	{
 		printf FILE "  {0x%04x, 0x%04x}\n", $index, $utf;
 	}

--- a/src/backend/utils/mb/Unicode/gb-18030-2000.xml
+++ b/src/backend/utils/mb/Unicode/gb-18030-2000.xml
--- a/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
+++ b/src/backend/utils/mb/Unicode/gb18030_to_utf8.map
--- a/src/backend/utils/mb/Unicode/utf8_to_gb18030.map
+++ b/src/backend/utils/mb/Unicode/utf8_to_gb18030.map
--- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
@@ -25,6 +25,161 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
 extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
 extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
+/*
+ * Convert 4-byte GB18030 characters to and from a linear code space
+ *
+ * The first and third bytes can range from 0x81 to 0xfe (126 values),
+ * while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
+ *
+ * gb_linear() takes a 4-byte code packed into a uint32, with the first byte
+ * in the most significant position, and returns its zero-based index in
+ * that code space; the subtracted constant makes 0x81308130 map to 0.
+ * The place values follow from the byte ranges above: 10 for the third
+ * byte, 126 * 10 = 1260 for the second, and 10 * 126 * 10 = 12600 for
+ * the first.
+ *
+ * gb_unlinear() is the inverse: it rebuilds the packed 4-byte code from a
+ * linear index.
+ *
+ * Neither function checks its argument.  gb_linear() does not verify that
+ * each byte lies in its legal range, and gb_unlinear() does not reduce the
+ * first byte, so an index of 126 * 12600 or more no longer yields a valid
+ * first byte.
+ * NOTE(review): the callers visible in this file range-check the whole
+ * value before calling, but nothing here validates the individual bytes;
+ * presumably the input has been verified earlier -- confirm.
+ */
+static inline uint32
+gb_linear(uint32 gb)
+{
+	uint32		b0 = (gb & 0xff000000) >> 24;	/* first byte */
+	uint32		b1 = (gb & 0x00ff0000) >> 16;	/* second byte */
+	uint32		b2 = (gb & 0x0000ff00) >> 8;	/* third byte */
+	uint32		b3 = (gb & 0x000000ff);	/* fourth byte */
+	return b0 * 12600 + b1 * 1260 + b2 * 10 + b3 -
+		(0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30);	/* index of 0x81308130 */
+}
+static inline uint32
+gb_unlinear(uint32 lin)
+{
+	uint32		r0 = 0x81 + lin / 12600;	/* first byte; not reduced modulo 126 */
+	uint32		r1 = 0x30 + (lin / 1260) % 10;	/* second byte */
+	uint32		r2 = 0x81 + (lin / 10) % 126;	/* third byte */
+	uint32		r3 = 0x30 + lin % 10;	/* fourth byte */
+	return (r0 << 24) | (r1 << 16) | (r2 << 8) | r3;	/* repack, first byte on top */
+}
+/*
+ * Convert word-formatted UTF8 to and from Unicode code points
+ *
+ * Probably this should be somewhere else ...
+ *
+ * "Word-formatted" here means the 1 to 4 bytes of the UTF8 sequence packed
+ * into a uint32, with the lead byte in the most significant occupied byte
+ * and the final byte in the low-order byte.  For example U+00E9, whose
+ * UTF8 bytes are C3 A9, is the word 0xC3A9.
+ *
+ * unicode_to_utf8word() picks the sequence length from the magnitude of
+ * the code point (<= 0x7F, <= 0x7FF, <= 0xFFFF, otherwise 4 bytes).  It
+ * does not reject values above 0x10FFFF; in the 4-byte form only the low
+ * 3 bits of (c >> 18) are kept.
+ *
+ * utf8word_to_unicode() infers the sequence length from the magnitude of
+ * the word, i.e. from how many of its bytes are occupied, and extracts the
+ * payload bits of each byte.  It does not check that the lead byte or the
+ * continuation bytes are well formed.
+ */
+static inline uint32
+unicode_to_utf8word(uint32 c)
+{
+	uint32		word;
+	if (c <= 0x7F)
+	{
+		word = c;	/* 1 byte: ASCII is unchanged */
+	}
+	else if (c <= 0x7FF)
+	{
+		word = (0xC0 | ((c >> 6) & 0x1F)) << 8;	/* 2 bytes: 110xxxxx lead */
+		word |= 0x80 | (c & 0x3F);	/* 10xxxxxx continuation */
+	}
+	else if (c <= 0xFFFF)
+	{
+		word = (0xE0 | ((c >> 12) & 0x0F)) << 16;	/* 3 bytes: 1110xxxx lead */
+		word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
+		word |= 0x80 | (c & 0x3F);
+	}
+	else
+	{
+		word = (0xF0 | ((c >> 18) & 0x07)) << 24;	/* 4 bytes: 11110xxx lead */
+		word |= (0x80 | ((c >> 12) & 0x3F)) << 16;
+		word |= (0x80 | ((c >> 6) & 0x3F)) << 8;
+		word |= 0x80 | (c & 0x3F);
+	}
+	return word;
+}
+static inline uint32
+utf8word_to_unicode(uint32 c)
+{
+	uint32		ucs;
+	if (c <= 0x7F)
+	{
+		ucs = c;	/* 1 byte: ASCII is unchanged */
+	}
+	else if (c <= 0xFFFF)
+	{
+		ucs = ((c >> 8) & 0x1F) << 6;	/* 2 bytes: 5 payload bits from the lead */
+		ucs |= c & 0x3F;	/* 6 payload bits from the continuation */
+	}
+	else if (c <= 0xFFFFFF)
+	{
+		ucs = ((c >> 16) & 0x0F) << 12;	/* 3 bytes: 4 payload bits from the lead */
+		ucs |= ((c >> 8) & 0x3F) << 6;
+		ucs |= c & 0x3F;
+	}
+	else
+	{
+		ucs = ((c >> 24) & 0x07) << 18;	/* 4 bytes: 3 payload bits from the lead */
+		ucs |= ((c >> 16) & 0x3F) << 12;
+		ucs |= ((c >> 8) & 0x3F) << 6;
+		ucs |= c & 0x3F;
+	}
+	return ucs;
+}
+/*
+ * Perform mapping of GB18030 ranges to UTF8
+ *
+ * The ranges we need to convert are specified in gb-18030-2000.xml.
+ * All are ranges of 4-byte GB18030 codes.
+ *
+ * "code" is a 4-byte GB18030 character packed into a uint32 in the form
+ * gb_linear() expects.  Each conv18030(minunicode, mincode, maxcode) line
+ * below handles one range: if "code" lies in mincode..maxcode (inclusive),
+ * the result is the Unicode code point minunicode plus the linear distance
+ * of "code" from mincode, returned as a UTF8 word.  The ranges are tested
+ * in order and the first match returns.
+ *
+ * Returns 0 if the code falls in none of the ranges.
+ *
+ * gb18030_to_utf8() passes this function to LocalToUtf() as its conversion
+ * callback.
+ *
+ * NOTE(review): conv18030 expands to a bare "if (...) return ..." that
+ * refers to the local variable "code" and does not parenthesize its
+ * arguments.  That is harmless for the literal constants used here, but
+ * the macro is not #undef'd in the visible code.
+ */
+static uint32
+conv_18030_to_utf8(uint32 code)
+{
+#define conv18030(minunicode, mincode, maxcode) \
+	if (code >= mincode && code <= maxcode) \
+		return unicode_to_utf8word(gb_linear(code) - gb_linear(mincode) + minunicode)
+	conv18030(0x0452, 0x8130D330, 0x8136A531);
+	conv18030(0x2643, 0x8137A839, 0x8138FD38);
+	conv18030(0x361B, 0x8230A633, 0x8230F237);
+	conv18030(0x3CE1, 0x8231D438, 0x8232AF32);
+	conv18030(0x4160, 0x8232C937, 0x8232F837);
+	conv18030(0x44D7, 0x8233A339, 0x8233C931);
+	conv18030(0x478E, 0x8233E838, 0x82349638);
+	conv18030(0x49B8, 0x8234A131, 0x8234E733);
+	conv18030(0x9FA6, 0x82358F33, 0x8336C738);
+	conv18030(0xE865, 0x8336D030, 0x84308534);
+	conv18030(0xFA2A, 0x84309C38, 0x84318537);
+	conv18030(0xFFE6, 0x8431A234, 0x8431A439);
+	conv18030(0x10000, 0x90308130, 0xE3329A35);	/* code points above U+FFFF */
+	/* No mapping exists */
+	return 0;
+}
+/*
+ * Perform mapping of UTF8 ranges to GB18030
+ *
+ * "code" is a UTF8 character in word format; it is first decoded to a
+ * Unicode code point.  Each convutf8(minunicode, maxunicode, mincode) line
+ * below handles one range: if the code point lies in
+ * minunicode..maxunicode (inclusive), the result is the 4-byte GB18030
+ * code found by advancing mincode by the code point's offset from
+ * minunicode in the linear code space.  The ranges are tested in order
+ * and the first match returns.
+ *
+ * Returns 0 if the code point falls in none of the ranges.
+ *
+ * utf8_to_gb18030() passes this function to UtfToLocal() as its conversion
+ * callback.
+ *
+ * NOTE(review): convutf8 expands to a bare "if (...) return ..." that
+ * refers to the local variable "ucs" and does not parenthesize its
+ * arguments.  That is harmless for the literal constants used here, but
+ * the macro is not #undef'd in the visible code.
+ */
+static uint32
+conv_utf8_to_18030(uint32 code)
+{
+	uint32		ucs = utf8word_to_unicode(code);	/* decoded code point */
+#define convutf8(minunicode, maxunicode, mincode) \
+	if (ucs >= minunicode && ucs <= maxunicode) \
+		return gb_unlinear(ucs - minunicode + gb_linear(mincode))
+	convutf8(0x0452, 0x200F, 0x8130D330);
+	convutf8(0x2643, 0x2E80, 0x8137A839);
+	convutf8(0x361B, 0x3917, 0x8230A633);
+	convutf8(0x3CE1, 0x4055, 0x8231D438);
+	convutf8(0x4160, 0x4336, 0x8232C937);
+	convutf8(0x44D7, 0x464B, 0x8233A339);
+	convutf8(0x478E, 0x4946, 0x8233E838);
+	convutf8(0x49B8, 0x4C76, 0x8234A131);
+	convutf8(0x9FA6, 0xD7FF, 0x82358F33);
+	convutf8(0xE865, 0xF92B, 0x8336D030);
+	convutf8(0xFA2A, 0xFE2F, 0x84309C38);
+	convutf8(0xFFE6, 0xFFFF, 0x8431A234);
+	convutf8(0x10000, 0x10FFFF, 0x90308130);	/* code points above U+FFFF */
+	/* No mapping exists */
+	return 0;
+}
 /* ----------
 * conv_proc(
 *		INTEGER,	-- source encoding id
@@ -47,7 +202,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
 	LocalToUtf(src, len, dest,
 			   LUmapGB18030, lengthof(LUmapGB18030),
 			   NULL, 0,
-			   NULL,
+			   conv_18030_to_utf8,
 			   PG_GB18030);
 	PG_RETURN_VOID();
@@ -65,7 +220,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
 	UtfToLocal(src, len, dest,
 			   ULmapGB18030, lengthof(ULmapGB18030),
 			   NULL, 0,
-			   NULL,
+			   conv_utf8_to_18030,
 			   PG_GB18030);
 	PG_RETURN_VOID();