Commit 8d3e0906 authored by Tom Lane

Extend GB18030 encoding conversion to cover full Unicode range.

Our previous code for GB18030 <-> UTF8 conversion only covered Unicode code
points up to U+FFFF, but the actual spec defines conversions for all code
points up to U+10FFFF.  That would be rather impractical as a lookup table,
but fortunately there is a simple algorithmic conversion between the
additional code points and the equivalent GB18030 byte patterns.  Make use
of the just-added callback facility in LocalToUtf/UtfToLocal to perform the
additional conversions.

Having created the infrastructure to do that, we can use the same code to
map certain linearly-related subranges of the Unicode space below U+FFFF,
allowing removal of the corresponding lookup table entries.  This more
than halves the lookup table size, which is a substantial savings;
utf8_and_gb18030.so drops from nearly a megabyte to about half that.

In support of doing that, replace ISO10646-GB18030.TXT with the data file
gb-18030-2000.xml (retrieved from
http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/ )
in which these subranges have been deleted from the simple lookup entries.

Per bug #12845 from Arjen Nienhuis.  The conversion code added here is
based on his proposed patch, though I whacked it around rather heavily.
parent 92edba26
This diff is collapsed.
...@@ -86,14 +86,14 @@ euc_tw_to_utf8.map utf8_to_euc_tw.map : CNS11643.TXT ...@@ -86,14 +86,14 @@ euc_tw_to_utf8.map utf8_to_euc_tw.map : CNS11643.TXT
sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT
$(PERL) $(srcdir)/UCS_to_SJIS.pl $(PERL) $(srcdir)/UCS_to_SJIS.pl
gb18030_to_utf8.map utf8_to_gb18030.map : ISO10646-GB18030.TXT gb18030_to_utf8.map utf8_to_gb18030.map : gb-18030-2000.xml
$(PERL) $(srcdir)/UCS_to_GB18030.pl $(PERL) $(srcdir)/UCS_to_GB18030.pl
big5_to_utf8.map utf8_to_big5.map : BIG5.TXT CP950.TXT big5_to_utf8.map utf8_to_big5.map : BIG5.TXT CP950.TXT
$(PERL) $(srcdir)/UCS_to_BIG5.pl $(PERL) $(srcdir)/UCS_to_BIG5.pl
clean:
rm -f $(MAPS)
distclean: clean distclean: clean
rm -f $(TEXTS) rm -f $(TEXTS)
maintainer-clean: distclean
rm -f $(MAPS)
...@@ -5,42 +5,46 @@ ...@@ -5,42 +5,46 @@
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl # src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
# #
# Generate UTF-8 <--> GB18030 code conversion tables from # Generate UTF-8 <--> GB18030 code conversion tables from
# "ISO10646-GB18030.TXT" # "gb-18030-2000.xml"
# #
# file format: # The lines we care about in the source file look like
# GB18030 hex code # <a u="009A" b="81 30 83 36"/>
# UCS-2 hex code # where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl"; require "ucs2utf.pl";
# first generate UTF-8 --> GB18030 table # Read the input
$in_file = "ISO10646-GB18030.TXT"; $in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file"); open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>) while (<FILE>)
{ {
chop; next if (! m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
if (/^#/) $u = $1;
{ $c = $2;
next; $c =~ s/ //g;
}
($u, $c, $rest) = split;
$ucs = hex($u); $ucs = hex($u);
$code = hex($c); $code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080) if ($code >= 0x80 && $ucs >= 0x0080)
{ {
$utf = &ucs2utf($ucs); $utf = &ucs2utf($ucs);
if ($array{$utf} ne "") if ($arrayu{$utf} ne "")
{ {
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs; printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next; next;
} }
if ($arrayc{$code} ne "")
{
printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
next;
}
$arrayu{$utf} = $code;
$arrayc{$code} = $utf;
$count++; $count++;
$array{$utf} = $code;
} }
} }
close(FILE); close(FILE);
...@@ -54,11 +58,12 @@ $file = "utf8_to_gb18030.map"; ...@@ -54,11 +58,12 @@ $file = "utf8_to_gb18030.map";
open(FILE, "> $file") || die("cannot open $file"); open(FILE, "> $file") || die("cannot open $file");
print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n"; print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array)) $cc = $count;
for $index (sort { $a <=> $b } keys(%arrayu))
{ {
$code = $array{$index}; $code = $arrayu{$index};
$count--; $cc--;
if ($count == 0) if ($cc == 0)
{ {
printf FILE " {0x%04x, 0x%04x}\n", $index, $code; printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
} }
...@@ -75,43 +80,17 @@ close(FILE); ...@@ -75,43 +80,17 @@ close(FILE);
# #
# then generate GB18030 --> UTF8 table # then generate GB18030 --> UTF8 table
# #
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($u, $c, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = "gb18030_to_utf8.map"; $file = "gb18030_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file"); open(FILE, "> $file") || die("cannot open $file");
print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n"; print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayc))
{ {
$utf = $array{$index}; $utf = $arrayc{$index};
$count--; $cc--;
if ($count == 0) if ($cc == 0)
{ {
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf; printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
} }
......
This diff is collapsed.
...@@ -25,6 +25,161 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030); ...@@ -25,6 +25,161 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS); extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS); extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
/*
* Convert 4-byte GB18030 characters to and from a linear code space
*
* The first and third bytes can range from 0x81 to 0xfe (126 values),
* while the second and fourth bytes can range from 0x30 to 0x39 (10 values).
*/
/*
 * gb_linear
 *		Map a packed 4-byte GB18030 code onto a linear index.
 *
 * "gb" carries the four bytes with the first byte in the most significant
 * position.  Each byte is reduced by its lowest legal value (0x81 for the
 * first and third bytes, 0x30 for the second and fourth) and the results
 * are combined as mixed-radix digits with radices 10, 126 and 10, so that
 * 0x81308130 maps to 0.
 */
static inline uint32
gb_linear(uint32 gb)
{
    uint32      lead = (gb >> 24) & 0xff;
    uint32      digit_hi = (gb >> 16) & 0xff;
    uint32      mid = (gb >> 8) & 0xff;
    uint32      digit_lo = gb & 0xff;
    uint32      pos;

    /* Horner evaluation; weights work out to 12600, 1260, 10 and 1 */
    pos = lead - 0x81;
    pos = pos * 10 + (digit_hi - 0x30);
    pos = pos * 126 + (mid - 0x81);
    pos = pos * 10 + (digit_lo - 0x30);

    return pos;
}
/*
 * gb_unlinear
 *		Turn a linear index back into a packed 4-byte GB18030 code.
 *
 * The index is split into mixed-radix digits (radices 10, 126, 10, taken
 * from the least significant end); each digit is then offset by the lowest
 * legal value of its byte position and the bytes are packed with the first
 * byte in the most significant position.
 */
static inline uint32
gb_unlinear(uint32 lin)
{
    uint32      rest = lin;
    uint32      byte4;
    uint32      byte3;
    uint32      byte2;
    uint32      byte1;

    /* Peel off digits starting with the fastest-varying (fourth) byte */
    byte4 = 0x30 + rest % 10;
    rest /= 10;
    byte3 = 0x81 + rest % 126;
    rest /= 126;
    byte2 = 0x30 + rest % 10;
    rest /= 10;
    /* whatever remains selects the first byte */
    byte1 = 0x81 + rest;

    return (byte1 << 24) | (byte2 << 16) | (byte3 << 8) | byte4;
}
/*
* Convert word-formatted UTF8 to and from Unicode code points
*
* Probably this should be somewhere else ...
*/
/*
 * unicode_to_utf8word
 *		Encode a Unicode code point as UTF8 packed into a single word.
 *
 * The UTF8 bytes are placed in the low-order end of the result with the
 * lead byte most significant: one byte for c <= 0x7F, two for c <= 0x7FF,
 * three for c <= 0xFFFF, and four bytes otherwise.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
    /* Continuation bytes (10xxxxxx) for successive 6-bit groups of c */
    uint32      cont0 = 0x80 | (c & 0x3F);
    uint32      cont1 = 0x80 | ((c >> 6) & 0x3F);
    uint32      cont2 = 0x80 | ((c >> 12) & 0x3F);

    if (c <= 0x7F)
    {
        /* plain ASCII is its own encoding */
        return c;
    }

    if (c <= 0x7FF)
    {
        /* 110xxxxx 10xxxxxx */
        return ((0xC0 | ((c >> 6) & 0x1F)) << 8) | cont0;
    }

    if (c <= 0xFFFF)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        return ((0xE0 | ((c >> 12) & 0x0F)) << 16) | (cont1 << 8) | cont0;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    return ((0xF0 | ((c >> 18) & 0x07)) << 24) |
        (cont2 << 16) | (cont1 << 8) | cont0;
}
/*
 * utf8word_to_unicode
 *		Decode word-formatted UTF8 into a Unicode code point.
 *
 * The encoded length is inferred from the magnitude of the word: values
 * up to 0x7F are one byte, up to 0xFFFF two bytes, up to 0xFFFFFF three
 * bytes, and anything larger four bytes.  No validation of the byte
 * patterns is done here; only the payload bits are extracted.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
    /* 6-bit payloads of the three lowest-order bytes */
    uint32      bits0 = c & 0x3F;
    uint32      bits1 = (c >> 8) & 0x3F;
    uint32      bits2 = (c >> 16) & 0x3F;

    if (c <= 0x7F)
    {
        /* single byte: the value is the code point */
        return c;
    }

    if (c <= 0xFFFF)
    {
        /* two bytes: 5 payload bits in the lead byte */
        return (((c >> 8) & 0x1F) << 6) | bits0;
    }

    if (c <= 0xFFFFFF)
    {
        /* three bytes: 4 payload bits in the lead byte */
        return (((c >> 16) & 0x0F) << 12) | (bits1 << 6) | bits0;
    }

    /* four bytes: 3 payload bits in the lead byte */
    return (((c >> 24) & 0x07) << 18) |
        (bits2 << 12) | (bits1 << 6) | bits0;
}
/*
 * Perform mapping of GB18030 ranges to UTF8
 *
 * The ranges we need to convert are specified in gb-18030-2000.xml.
 * All are ranges of 4-byte GB18030 codes.
 *
 * "code" is a GB18030 character packed into a word, first byte most
 * significant.  If it falls inside one of the ranges below, the result is
 * the word-formatted UTF8 for the code point at the same linear offset
 * from the start of the range; otherwise 0 is returned to report that no
 * mapping exists.
 *
 * The ranges are kept in a constant table rather than expanded through a
 * statement macro, so nothing is left defined in the preprocessor namespace
 * after this function.
 */
static uint32
conv_18030_to_utf8(uint32 code)
{
    static const struct
    {
        uint32      minunicode; /* code point matching mincode */
        uint32      mincode;    /* first GB18030 code of the range */
        uint32      maxcode;    /* last GB18030 code of the range */
    }           ranges[] =
    {
        {0x0452, 0x8130D330, 0x8136A531},
        {0x2643, 0x8137A839, 0x8138FD38},
        {0x361B, 0x8230A633, 0x8230F237},
        {0x3CE1, 0x8231D438, 0x8232AF32},
        {0x4160, 0x8232C937, 0x8232F837},
        {0x44D7, 0x8233A339, 0x8233C931},
        {0x478E, 0x8233E838, 0x82349638},
        {0x49B8, 0x8234A131, 0x8234E733},
        {0x9FA6, 0x82358F33, 0x8336C738},
        {0xE865, 0x8336D030, 0x84308534},
        {0xFA2A, 0x84309C38, 0x84318537},
        {0xFFE6, 0x8431A234, 0x8431A439},
        {0x10000, 0x90308130, 0xE3329A35}
    };
    unsigned int i;

    for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
    {
        if (code >= ranges[i].mincode && code <= ranges[i].maxcode)
        {
            /* offset within the range, measured in linear code space */
            return unicode_to_utf8word(gb_linear(code) -
                                       gb_linear(ranges[i].mincode) +
                                       ranges[i].minunicode);
        }
    }

    /* No mapping exists */
    return 0;
}
/*
 * Perform mapping of UTF8 ranges to GB18030
 *
 * "code" is word-formatted UTF8.  It is first decoded to a Unicode code
 * point; if that code point lies inside one of the ranges below, the result
 * is the 4-byte GB18030 code at the same linear offset from the start of
 * the range, packed first byte most significant.  Otherwise 0 is returned
 * to report that no mapping exists.
 *
 * The ranges are kept in a constant table rather than expanded through a
 * statement macro, so nothing is left defined in the preprocessor namespace
 * after this function.
 */
static uint32
conv_utf8_to_18030(uint32 code)
{
    static const struct
    {
        uint32      minunicode; /* first code point of the range */
        uint32      maxunicode; /* last code point of the range */
        uint32      mincode;    /* GB18030 code matching minunicode */
    }           ranges[] =
    {
        {0x0452, 0x200F, 0x8130D330},
        {0x2643, 0x2E80, 0x8137A839},
        {0x361B, 0x3917, 0x8230A633},
        {0x3CE1, 0x4055, 0x8231D438},
        {0x4160, 0x4336, 0x8232C937},
        {0x44D7, 0x464B, 0x8233A339},
        {0x478E, 0x4946, 0x8233E838},
        {0x49B8, 0x4C76, 0x8234A131},
        {0x9FA6, 0xD7FF, 0x82358F33},
        {0xE865, 0xF92B, 0x8336D030},
        {0xFA2A, 0xFE2F, 0x84309C38},
        {0xFFE6, 0xFFFF, 0x8431A234},
        {0x10000, 0x10FFFF, 0x90308130}
    };
    uint32      ucs = utf8word_to_unicode(code);
    unsigned int i;

    for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
    {
        if (ucs >= ranges[i].minunicode && ucs <= ranges[i].maxunicode)
        {
            /* carry the offset within the range over to linear code space */
            return gb_unlinear(ucs - ranges[i].minunicode +
                               gb_linear(ranges[i].mincode));
        }
    }

    /* No mapping exists */
    return 0;
}
/* ---------- /* ----------
* conv_proc( * conv_proc(
* INTEGER, -- source encoding id * INTEGER, -- source encoding id
...@@ -47,7 +202,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS) ...@@ -47,7 +202,7 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
LocalToUtf(src, len, dest, LocalToUtf(src, len, dest,
LUmapGB18030, lengthof(LUmapGB18030), LUmapGB18030, lengthof(LUmapGB18030),
NULL, 0, NULL, 0,
NULL, conv_18030_to_utf8,
PG_GB18030); PG_GB18030);
PG_RETURN_VOID(); PG_RETURN_VOID();
...@@ -65,7 +220,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) ...@@ -65,7 +220,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
UtfToLocal(src, len, dest, UtfToLocal(src, len, dest,
ULmapGB18030, lengthof(ULmapGB18030), ULmapGB18030, lengthof(ULmapGB18030),
NULL, 0, NULL, 0,
NULL, conv_utf8_to_18030,
PG_GB18030); PG_GB18030);
PG_RETURN_VOID(); PG_RETURN_VOID();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment