Commit 1de9cc0d authored by Heikki Linnakangas

Rewrite the perl scripts to produce our Unicode conversion tables.

Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no
longer available.

Get UHC from windows-949-2000.xml, because it is more up-to-date.

Plus tons more small changes. With these changes, the perl scripts
faithfully produce the *.map files we have in the repository, from the
external source files.

In passing, fix the Makefile to also download CP932.TXT and CP950.TXT.

Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson.

Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
parent 6c303223
......@@ -39,8 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
win1258_to_utf8.map utf8_to_win1258.map
GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
johab_to_utf8.map utf8_to_johab.map \
uhc_to_utf8.map utf8_to_uhc.map \
gbk_to_utf8.map utf8_to_gbk.map \
koi8r_to_utf8.map utf8_to_koi8r.map
......@@ -51,6 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
sjis_to_utf8.map utf8_to_sjis.map \
gb18030_to_utf8.map utf8_to_gb18030.map \
big5_to_utf8.map utf8_to_big5.map \
johab_to_utf8.map utf8_to_johab.map \
uhc_to_utf8.map utf8_to_uhc.map \
euc_jis_2004_to_utf8.map euc_jis_2004_to_utf8_combined.map \
utf8_to_euc_jis_2004.map utf8_to_euc_jis_2004_combined.map \
shift_jis_2004_to_utf8.map shift_jis_2004_to_utf8_combined.map \
......@@ -63,23 +63,29 @@ ISO8859TEXTS = 8859-2.TXT 8859-3.TXT 8859-4.TXT 8859-5.TXT \
8859-10.TXT 8859-13.TXT 8859-14.TXT 8859-15.TXT \
8859-16.TXT
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT CP949.TXT \
WINTEXTS = CP866.TXT CP874.TXT CP936.TXT \
CP1250.TXT CP1251.TXT \
CP1252.TXT CP1253.TXT CP1254.TXT CP1255.TXT \
CP1256.TXT CP1257.TXT CP1258.TXT
GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
KOI8-R.TXT KOI8-U.TXT JOHAB.TXT
KOI8-R.TXT KOI8-U.TXT
all: $(MAPS)
$(GENERICMAPS): UCS_to_most.pl $(GENERICTEXTS)
$(PERL) $<
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl JIS0201.TXT JIS0208.TXT JIS0212.TXT
johab_to_utf8.map utf8_to_johab.map: UCS_to_JOHAB.pl JOHAB.TXT
$(PERL) $<
uhc_to_utf8.map utf8_to_uhc.map: UCS_to_UHC.pl windows-949-2000.xml
$(PERL) $<
euc_jp_to_utf8.map utf8_to_euc_jp.map: UCS_to_EUC_JP.pl CP932.TXT JIS0212.TXT
$(PERL) $<
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl GB2312.TXT
euc_cn_to_utf8.map utf8_to_euc_cn.map: UCS_to_EUC_CN.pl gb-18030-2000.xml
$(PERL) $<
euc_kr_to_utf8.map utf8_to_euc_kr.map: UCS_to_EUC_KR.pl KSX1001.TXT
......@@ -119,7 +125,7 @@ BIG5.TXT CNS11643.TXT:
euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F)
gb-18030-2000.xml:
gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
GB2312.TXT:
......@@ -137,7 +143,7 @@ KOI8-R.TXT KOI8-U.TXT:
$(ISO8859TEXTS):
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
$(filter-out CP8%,$(WINTEXTS)):
$(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
$(filter CP8%,$(WINTEXTS)):
......
......@@ -25,56 +25,17 @@
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# Load BIG5.TXT
my $all = &read_source("BIG5.TXT");
#
# first, generate UTF8 --> BIG5 table
#
$in_file = "BIG5.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
# Load CP950.TXT
my $cp950txt = &read_source("CP950.TXT");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
}
}
close(FILE);
$in_file = "CP950.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
foreach my $i (@$cp950txt) {
my $code = $i->{code};
my $ucs = $i->{ucs};
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
# from CP950.TXT
......@@ -83,126 +44,25 @@ while (<FILE>)
&& $code >= 0xf9d6
&& $code <= 0xf9dc)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
push @$all, {code => $code,
ucs => $ucs,
comment => $i->{comment},
direction => "both"};
}
}
close(FILE);
$file = lc("utf8_to_big5.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate BIG5 --> UTF8 table
#
$in_file = "BIG5.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
foreach my $i (@$all) {
my $code = $i->{code};
my $ucs = $i->{ucs};
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$in_file = "CP950.TXT";
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
# from CP950.TXT
if ( $code >= 0x80
&& $ucs >= 0x0080
&& $code >= 0xf9d6
&& $code <= 0xf9dc)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = lc("big5_to_utf8.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
# BIG5.TXT maps several BIG5 characters to U+FFFD. The UTF-8 to BIG5 mapping can
# contain only one of them. XXX: Doesn't really make sense to include any of them,
# but for historical reasons, we map the first one of them.
if ($i->{ucs} == 0xFFFD && $i->{code} != 0xA15A)
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
$i->{direction} = "to_unicode";
}
}
print FILE "};\n";
close(FILE);
# Output
print_tables("BIG5", $all);
#! /usr/bin/perl
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
# Copyright (c) 2007-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_EUC_CN.pl
# src/backend/utils/mb/Unicode/UCS_to_GB18030.pl
#
# Generate UTF-8 <--> EUC_CN code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain GB2312.TXT from
# the organization's ftp site.
# Generate UTF-8 <--> GB18030 code conversion tables from
# "gb-18030-2000.xml", obtained from
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
#
# GB2312.TXT format:
# GB2312 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# The lines we care about in the source file look like
# <a u="009A" b="81 30 83 36"/>
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_CN table
# Read the input
$in_file = "GB2312.TXT";
$in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
my @mapping;
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
$u = $1;
$c = $2;
$c =~ s/ //g;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8080);
}
}
close(FILE);
$file = "utf8_to_euc_cn.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_CN[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
# The GB-18030 character set, which we use as the source, contains
# a lot of extra characters on top of the GB2312 character set that
# EUC_CN encodes. Filter out those extra characters.
next if (($code & 0xFF) < 0xA1);
next if (!($code >= 0xA100 && $code <= 0xA9FF ||
$code >= 0xB000 && $code <= 0xF7FF));
next if ($code >= 0xA2A1 && $code <= 0xA2B0);
next if ($code >= 0xA2E3 && $code <= 0xA2E4);
next if ($code >= 0xA2EF && $code <= 0xA2F0);
next if ($code >= 0xA2FD && $code <= 0xA2FE);
next if ($code >= 0xA4F4 && $code <= 0xA4FE);
next if ($code >= 0xA5F7 && $code <= 0xA5FE);
next if ($code >= 0xA6B9 && $code <= 0xA6C0);
next if ($code >= 0xA6D9 && $code <= 0xA6FE);
next if ($code >= 0xA7C2 && $code <= 0xA7D0);
next if ($code >= 0xA7F2 && $code <= 0xA7FE);
next if ($code >= 0xA8BB && $code <= 0xA8C4);
next if ($code >= 0xA8EA && $code <= 0xA8FE);
next if ($code >= 0xA9A1 && $code <= 0xA9A3);
next if ($code >= 0xA9F0 && $code <= 0xA9FE);
next if ($code >= 0xD7FA && $code <= 0xD7FE);
# A couple of characters are mapped differently from GB-2312 or GB-18030
if ($code == 0xA1A4)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
$ucs = 0x30FB;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate EUC_CN --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
if ($code == 0xA1AA)
{
next;
$ucs = 0x2015;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8080;
$array{$code} = $utf;
push @mapping, {
ucs => $ucs,
code => $code,
direction => 'both'
}
}
close(FILE);
$file = "euc_cn_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_CN[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_CN", \@mapping);
......@@ -7,9 +7,7 @@
# Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
# "euc-jis-2004-std.txt" (http://x0213.org)
require "ucs2utf.pl";
$TEST = 0;
require "convutils.pm";
# first generate UTF-8 --> EUC_JIS_2004 table
......@@ -17,10 +15,7 @@ $in_file = "euc-jis-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
my @all;
while ($line = <FILE>)
{
......@@ -31,14 +26,14 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$str} = $code;
$comment1{$str} = $rest;
$count1++;
$ucs1 = hex($u1);
$ucs2 = hex($u2);
push @all, { direction => 'both',
ucs => $ucs1,
ucs_second => $ucs2,
code => $code,
comment => $rest };
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
......@@ -54,252 +49,11 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
$comment{$code} = $rest;
}
close(FILE);
$file = "utf8_to_euc_jis_2004.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_utf_to_local ULmapEUC_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
$comment{$code};
}
else
{
printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
if ($TEST == 1)
{
$file1 = "utf8.data";
$file2 = "euc_jis_2004.data";
open(FILE1, "> $file1") || die("cannot open $file1");
open(FILE2, "> $file2") || die("cannot open $file2");
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
if ( $code > 0x00
&& $code != 0x09
&& $code != 0x0a
&& $code != 0x0d
&& $code != 0x5c
&& ( $code < 0x80
|| ($code >= 0x8ea1 && $code <= 0x8efe)
|| ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
|| ($code >= 0xa1a1 && $code <= 0x8fefe)))
{
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($index & $mask) >> $s)
if $index & $mask;
print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
}
print FILE1 "\n";
print FILE2 "\n";
}
}
}
$file = "utf8_to_euc_jis_2004_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n";
next if ($code < 0x80 && $ucs < 0x80);
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%s, 0x%s, 0x%06x} /* %s */\n", substr($index, 0, 8),
substr($index, 8, 8), $code, $comment1{$index};
}
else
{
printf FILE " {0x%s, 0x%s, 0x%06x}, /* %s */\n",
substr($index, 0, 8), substr($index, 8, 8), $code,
$comment1{$index};
}
push @all, { direction => 'both', ucs => $ucs, code => $code, comment => $rest };
}
print FILE "};\n";
close(FILE);
if ($TEST == 1)
{
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
if ( $code > 0x00
&& $code != 0x09
&& $code != 0x0a
&& $code != 0x0d
&& $code != 0x5c
&& ( $code < 0x80
|| ($code >= 0x8ea1 && $code <= 0x8efe)
|| ($code >= 0x8fa1a1 && $code <= 0x8ffefe)
|| ($code >= 0xa1a1 && $code <= 0x8fefe)))
{
$v1 = hex(substr($index, 0, 8));
$v2 = hex(substr($index, 8, 8));
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($v1 & $mask) >> $s) if $v1 & $mask;
print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
}
for ($i = 3; $i >= 0; $i--)
{
$s = $i * 8;
$mask = 0xff << $s;
print FILE1 pack("C", ($v2 & $mask) >> $s) if $v2 & $mask;
}
print FILE1 "\n";
print FILE2 "\n";
}
}
close(FILE1);
close(FILE2);
}
# then generate EUC_JIS_2004 --> UTF-8 table
$in_file = "euc-jis-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
while ($line = <FILE>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u1 = $2;
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$code} = $str;
$comment1{$code} = $rest;
$count1++;
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u = $2;
$rest = "U+" . $u . $3;
}
else
{
next;
}
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
$comment{$utf} = $rest;
}
close(FILE);
$file = "euc_jis_2004_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_local_to_utf LUmapEUC_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%06x, 0x%08x} /* %s */\n", $index, $code,
$comment{$code};
}
else
{
printf FILE " {0x%06x, 0x%08x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
$file = "euc_jis_2004_to_utf8_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n";
for $index (sort { $a <=> $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%06x, 0x%s, 0x%s} /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
else
{
printf FILE " {0x%06x, 0x%s, 0x%s}, /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
}
print FILE "};\n";
close(FILE);
print_tables("EUC_JIS_2004", \@all, 1);
......@@ -16,113 +16,22 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_KR table
# Load the source file.
$in_file = "KSX1001.TXT";
my $mapping = &read_source("KSX1001.TXT");
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = ($code | 0x8080);
}
}
close(FILE);
$file = "utf8_to_euc_kr.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_KR[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
foreach my $i (@$mapping)
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
$i->{code} = $i->{code} | 0x8080;
}
print FILE "};\n";
close(FILE);
#
# then generate EUC_KR --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$code |= 0x8080;
$array{$code} = $utf;
}
}
close(FILE);
$file = "euc_kr_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_KR[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
# Some extra characters that are not in KSX1001.TXT
push @$mapping, (
{direction => 'both', ucs => 0x20AC, code => 0xa2e6, comment => '# EURO SIGN'},
{direction => 'both', ucs => 0x00AE, code => 0xa2e7, comment => '# REGISTERED SIGN'},
{direction => 'both', ucs => 0x327E, code => 0xa2e8, comment => '# CIRCLED HANGUL IEUNG U'}
);
print FILE "};\n";
close(FILE);
print_tables("EUC_KR", $mapping);
......@@ -17,141 +17,47 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> EUC_TW table
my $mapping = &read_source("CNS11643.TXT");
$in_file = "CNS11643.TXT";
my @extras;
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
foreach my $i (@$mapping)
{
chop;
if (/^#/)
my $ucs = $i->{ucs};
my $code = $i->{code};
my $origcode = $i->{code};
my $plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
if ($plane == 1)
{
$array{$utf} = (($code & 0xffff) | 0x8080);
}
else
{
$array{$utf} =
(0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
}
}
}
close(FILE);
$file = "utf8_to_euc_tw.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapEUC_TW[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
if ($plane == 1)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
$code = ($code & 0xffff) | 0x8080;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
$code = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
}
}
print FILE "};\n";
close(FILE);
#
# then generate EUC_TW --> UTF8 table
#
reset 'array';
open(FILE, $in_file) || die("cannot open $in_file");
$i->{code} = $code;
while (<FILE>)
{
chop;
if (/^#/)
# Some codes are mapped twice in the EUC_TW to UTF-8 table.
if ($origcode >= 0x12121 && $origcode <= 0x20000)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate code: %04x\n", $ucs;
next;
}
$count++;
$plane = ($code & 0x1f0000) >> 16;
if ($plane > 16)
{
printf STDERR "Warning: invalid plane No.$plane. ignored\n";
next;
}
if ($plane == 1)
{
$c = (($code & 0xffff) | 0x8080);
$array{$c} = $utf;
$count++;
push @extras, {
ucs => $i->{ucs},
code => ($i->{code} + 0x8ea10000),
rest => $i->{rest},
direction => 'to_unicode'
}
$c = (0x8ea00000 + ($plane << 16)) | (($code & 0xffff) | 0x8080);
$array{$c} = $utf;
}
}
close(FILE);
$file = "euc_tw_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapEUC_TW[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
push @$mapping, @extras;
print FILE "};\n";
close(FILE);
print_tables("EUC_TW", $mapping);
......@@ -13,8 +13,7 @@
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for GB18030
require "ucs2utf.pl";
require "convutils.pm";
# Read the input
......@@ -22,6 +21,8 @@ $in_file = "gb-18030-2000.xml";
open(FILE, $in_file) || die("cannot open $in_file");
my @mapping;
while (<FILE>)
{
next if (!m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
......@@ -32,78 +33,13 @@ while (<FILE>)
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($arrayu{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
push @mapping, {
ucs => $ucs,
code => $code,
direction => 'both'
}
if ($arrayc{$code} ne "")
{
printf STDERR "Warning: duplicate GB18030: %08x\n", $code;
next;
}
$arrayu{$utf} = $code;
$arrayc{$code} = $utf;
$count++;
}
}
close(FILE);
#
# first, generate UTF8 --> GB18030 table
#
$file = "utf8_to_gb18030.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapGB18030[ $count ] = {\n";
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayu))
{
$code = $arrayu{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate GB18030 --> UTF8 table
#
$file = "gb18030_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapGB18030[ $count ] = {\n";
$cc = $count;
for $index (sort { $a <=> $b } keys(%arrayc))
{
$utf = $arrayc{$index};
$cc--;
if ($cc == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables("GB18030", \@mapping);
#! /usr/bin/perl
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl
#
# Generate UTF-8 <--> JOHAB conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain the map files from the organization's ftp site.
# ftp://www.unicode.org/Public/MAPPINGS/
# We assume the file includes three tab-separated columns:
# JOHAB code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)

use strict;
use warnings;

require "convutils.pm";

# Load the source file. The result is used below as a reference to an
# array of hashes, one entry per mapped character.
my $mapping = read_source("JOHAB.TXT");

# Some extra characters that are not in JOHAB.TXT.  Each entry uses the
# same keys as the other entries pushed onto mapping lists: the Unicode
# code point (ucs), the JOHAB code (code), the conversion direction, and
# a descriptive comment.
push @$mapping, (
	{direction => 'both', ucs => 0x20AC, code => 0xd9e6, comment => '# EURO SIGN'},
	{direction => 'both', ucs => 0x00AE, code => 0xd9e7, comment => '# REGISTERED SIGN'},
	{direction => 'both', ucs => 0x327E, code => 0xd9e8, comment => '# CIRCLED HANGUL IEUNG U'}
);

# Hand the complete mapping list to the shared output routine in
# convutils.pm (presumably it writes johab_to_utf8.map and
# utf8_to_johab.map -- confirm against convutils.pm).
print_tables("JOHAB", $mapping);
......@@ -7,7 +7,7 @@
# Generate UTF-8 <--> SHIFT_JIS_2004 code conversion tables from
# "sjis-0213-2004-std.txt" (http://x0213.org)
require "ucs2utf.pl";
require "convutils.pm";
# first generate UTF-8 --> SHIFT_JIS_2004 table
......@@ -15,10 +15,7 @@ $in_file = "sjis-0213-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
my @mapping;
while ($line = <FILE>)
{
......@@ -29,14 +26,16 @@ while ($line = <FILE>)
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$str} = $code;
$comment1{$str} = $rest;
$count1++;
$ucs1 = hex($u1);
$ucs2 = hex($u2);
push @mapping, {
code => $code,
ucs => $ucs1,
ucs_second => $ucs2,
comment => $rest,
direction => 'both'
};
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
......@@ -52,183 +51,31 @@ while ($line = <FILE>)
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR
"Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
$ucs, $code;
next;
}
$count++;
$array{$utf} = $code;
$comment{$code} = $rest;
}
close(FILE);
$file = "utf8_to_shift_jis_2004.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%08x, 0x%06x} /* %s */\n", $index, $code,
$comment{$code};
}
else
if ($code < 0x80 && $ucs < 0x80)
{
printf FILE " {0x%08x, 0x%06x}, /* %s */\n", $index, $code,
$comment{$code};
}
}
print FILE "};\n";
close(FILE);
$file = "utf8_to_shift_jis_2004_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {\n";
for $index (sort { $a cmp $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%s, 0x%s, 0x%04x} /* %s */\n", substr($index, 0, 8),
substr($index, 8, 8), $code, $comment1{$index};
}
else
{
printf FILE " {0x%s, 0x%s, 0x%04x}, /* %s */\n",
substr($index, 0, 8), substr($index, 8, 8), $code,
$comment1{$index};
}
}
print FILE "};\n";
close(FILE);
# then generate SHIFT_JIS_2004 --> UTF-8 table
$in_file = "sjis-0213-2004-std.txt";
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
reset 'array1';
reset 'comment';
reset 'comment1';
while ($line = <FILE>)
{
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
{
$c = $1;
$u1 = $2;
$u2 = $3;
$rest = "U+" . $u1 . "+" . $u2 . $4;
$code = hex($c);
$ucs = hex($u1);
$utf1 = &ucs2utf($ucs);
$ucs = hex($u2);
$utf2 = &ucs2utf($ucs);
$str = sprintf "%08x%08x", $utf1, $utf2;
$array1{$code} = $str;
$comment1{$code} = $rest;
$count1++;
next;
}
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
elsif ($code < 0x80)
{
$c = $1;
$u = $2;
$rest = "U+" . $u . $3;
$direction = 'from_unicode';
}
else
{
next;
}
$ucs = hex($u);
$code = hex($c);
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR
"Warning: duplicate UTF8: %08x UCS: %04x Shift JIS: %04x\n", $utf,
$ucs, $code;
printf STDERR "Previous value: UTF8: %08x\n", $array{$utf};
next;
}
$count++;
$array{$code} = $utf;
$comment{$utf} = $rest;
}
close(FILE);
$file = "shift_jis_2004_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFTJIS_2004.pl\n";
print FILE " */\n";
print FILE "static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
elsif ($ucs < 0x80)
{
printf FILE " {0x%04x, 0x%08x} /* %s */\n", $index, $code,
$comment{$code};
$direction = 'to_unicode';
}
else
{
printf FILE " {0x%04x, 0x%08x}, /* %s */\n", $index, $code,
$comment{$code};
$direction = 'both';
}
}
print FILE "};\n";
close(FILE);
$file = "shift_jis_2004_to_utf8_combined.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/*\n";
print FILE " * This file was generated by UCS_to_SHIFT_JIS_2004.pl\n";
print FILE " */\n";
print FILE
"static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {\n";
for $index (sort { $a <=> $b } keys(%array1))
{
$code = $array1{$index};
$count1--;
if ($count1 == 0)
{
printf FILE " {0x%04x, 0x%s, 0x%s} /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
else
{
printf FILE " {0x%04x, 0x%s, 0x%s}, /* %s */\n", $index,
substr($code, 0, 8), substr($code, 8, 8), $comment1{$index};
}
push @mapping, {
code => $code,
ucs => $ucs,
comment => $rest,
direction => $direction
};
}
print FILE "};\n";
close(FILE);
print_tables("SHIFT_JIS_2004", \@mapping, 1);
......@@ -4,138 +4,45 @@
#
# src/backend/utils/mb/Unicode/UCS_to_SJIS.pl
#
# Generate UTF-8 <--> SJIS code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain SHIFTJIS.TXT from
# the organization's ftp site.
#
# SHIFTJIS.TXT format:
# SHIFTJIS code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# Warning: SHIFTJIS.TXT contains only JIS0201 and JIS0208. no JIS0212.
require "ucs2utf.pl";
# first generate UTF-8 --> SJIS table
$in_file = "CP932.TXT";
$count = 0;
open(FILE, $in_file) || die("cannot open $in_file");
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ((($code >= 0xed40) && ($code <= 0xeefc))
|| ( ($code >= 0x8754)
&& ($code <= 0x875d))
|| ($code == 0x878a)
|| ($code == 0x8782)
|| ($code == 0x8784)
|| ($code == 0xfa5b)
|| ($code == 0xfa54)
|| ( ($code >= 0x8790)
&& ($code <= 0x8792))
|| ( ($code >= 0x8795)
&& ($code <= 0x8797))
|| ( ($code >= 0x879a)
&& ($code <= 0x879c)))
{
printf STDERR "Warning: duplicate UTF8: UCS=0x%04x SJIS=0x%04x\n",
$ucs,
$code;
next;
}
$count++;
$array{$utf} = $code;
}
}
# Generate UTF-8 <=> SJIS code conversion radix tree.
# Unfortunately it is prohibited by the Unicode organization to
# distribute the map files. So if you try to use this script, you
# have to obtain CP932.TXT from the organization's ftp site.
close(FILE);
use strict;
require "convutils.pm";
$file = "utf8_to_sjis.map";
open(FILE, "> $file") || die("cannot open $file");
my $charset = read_source("CP932.TXT");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmapSJIS[ $count ] = {\n";
# Drop these SJIS codes from the source for UTF8=>SJIS conversion
my @reject_sjis =(
0xed40..0xeefc, 0x8754..0x875d, 0x878a, 0x8782,
0x8784, 0xfa5b, 0xfa54, 0x8790..0x8792, 0x8795..0x8797,
0x879a..0x879c
);
for $index (sort { $a <=> $b } keys(%array))
foreach my $i (@$charset)
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
my $code = $i->{code};
my $ucs = $i->{ucs};
#
# then generate SJIS --> UTF8 table
#
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
$count = 0;
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = "sjis_to_utf8.map";
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmapSJIS[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
if (grep {$code == $_} @reject_sjis)
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
$i->{direction} = "to_unicode";
}
}
print FILE "};\n";
close(FILE);
# Add these UTF8->SJIS pairs to the table.
push @$charset, (
{direction => "from_unicode", ucs => 0x00a2, code => 0x8191, comment => '# CENT SIGN'},
{direction => "from_unicode", ucs => 0x00a3, code => 0x8192, comment => '# POUND SIGN'},
{direction => "from_unicode", ucs => 0x00a5, code => 0x5c, comment => '# YEN SIGN'},
{direction => "from_unicode", ucs => 0x00ac, code => 0x81ca, comment => '# NOT SIGN'},
{direction => "from_unicode", ucs => 0x2016, code => 0x8161, comment => '# DOUBLE VERTICAL LINE'},
{direction => "from_unicode", ucs => 0x203e, code => 0x7e, comment => '# OVERLINE'},
{direction => "from_unicode", ucs => 0x2212, code => 0x817c, comment => '# MINUS SIGN'},
{direction => "from_unicode", ucs => 0x301c, code => 0x8160, comment => '# WAVE DASH'}
);
print_tables("SJIS", $charset);
#! /usr/bin/perl
#
# Copyright (c) 2007-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_UHC.pl
#
# Generate UTF-8 <--> UHC code conversion tables from
# "windows-949-2000.xml", obtained from
# http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/
#
# The lines we care about in the source file look like
# <a u="AC02" b="81 41"/>
# where the "u" field is the Unicode code point in hex,
# and the "b" field is the hex byte sequence for UHC
use strict;

require "convutils.pm";

# Read the input

my $in_file = "windows-949-2000.xml";

open(my $in, '<', $in_file) or die("cannot open $in_file: $!");

# List of mapping hashes in the format print_tables() expects.
my @mapping;

while (my $line = <$in>)
{
	# Only <a u="..." b="..."/> elements carry a mapping; skip anything else.
	next if ($line !~ m/<a u="([0-9A-F]+)" b="([0-9A-F ]+)"/);
	my ($u, $c) = ($1, $2);

	# The byte sequence is written as space-separated hex pairs; join them
	# so that the whole sequence can be read as one integer.
	$c =~ s/ //g;
	my $ucs  = hex($u);
	my $code = hex($c);

	# These two single-byte codes are deliberately left out of the tables.
	next if ($code == 0x0080 || $code == 0x00FF);

	# Keep only mappings where both sides are outside the 7-bit range.
	if ($code >= 0x80 && $ucs >= 0x0080)
	{
		push @mapping,
		  {
			ucs       => $ucs,
			code      => $code,
			direction => 'both'
		  };
	}
}
close($in);

# One extra character that's not in the source file.
push @mapping,
  {
	direction => 'both',
	code      => 0xa2e8,
	ucs       => 0x327e,
	comment   => 'CIRCLED HANGUL IEUNG U'
  };

print_tables("UHC", \@mapping);
......@@ -15,7 +15,7 @@
# UCS-2 code in hex
# # and Unicode name (not used in this script)
require "ucs2utf.pl";
require "convutils.pm";
%filename = (
'WIN866' => 'CP866.TXT',
......@@ -44,121 +44,13 @@ require "ucs2utf.pl";
'ISO8859_16' => '8859-16.TXT',
'KOI8R' => 'KOI8-R.TXT',
'KOI8U' => 'KOI8-U.TXT',
'GBK' => 'CP936.TXT',
'UHC' => 'CP949.TXT',
'JOHAB' => 'JOHAB.TXT',);
'GBK' => 'CP936.TXT');
@charsets = keys(%filename);
@charsets = @ARGV if scalar(@ARGV);
foreach $charset (@charsets)
{
my $mapping = &read_source($filename{$charset});
#
# first, generate UTF8-> charset table
#
$in_file = $filename{$charset};
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$utf} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$utf} = $code;
}
}
close(FILE);
$file = lc("utf8_to_${charset}.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_utf_to_local ULmap${charset}[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$code = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
}
}
print FILE "};\n";
close(FILE);
#
# then generate character set code ->UTF8 table
#
open(FILE, $in_file) || die("cannot open $in_file");
reset 'array';
while (<FILE>)
{
chop;
if (/^#/)
{
next;
}
($c, $u, $rest) = split;
$ucs = hex($u);
$code = hex($c);
if ($code >= 0x80 && $ucs >= 0x0080)
{
$utf = &ucs2utf($ucs);
if ($array{$code} ne "")
{
printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
next;
}
$count++;
$array{$code} = $utf;
}
}
close(FILE);
$file = lc("${charset}_to_utf8.map");
open(FILE, "> $file") || die("cannot open $file");
print FILE "/* src/backend/utils/mb/Unicode/$file */\n\n";
print FILE "static const pg_local_to_utf LUmap${charset}[ $count ] = {\n";
for $index (sort { $a <=> $b } keys(%array))
{
$utf = $array{$index};
$count--;
if ($count == 0)
{
printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
}
else
{
printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
}
}
print FILE "};\n";
close(FILE);
print_tables($charset, $mapping);
}
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/convutils.pm
use strict;
#######################################################################
# ucs2utf - encode a Unicode code point as UTF-8
#
# ucs ; Unicode code point, as an integer
#
# Returns the UTF-8 byte sequence packed into one integer, lead byte in
# the most significant position (for example U+00A2 gives 0xc2a2).
sub ucs2utf
{
	my ($ucs) = @_;

	# 7-bit code points are passed through untouched.
	return ($ucs) if ($ucs <= 0x007f);

	# Every trailing byte carries six payload bits below a 0x80 marker.
	my $tail_low = ($ucs & 0x003f) | 0x80;

	# Two-byte form: 110xxxxx 10xxxxxx
	if ($ucs <= 0x07ff)
	{
		my $lead = ($ucs >> 6) | 0xc0;
		return (($lead << 8) | $tail_low);
	}

	my $tail_mid = (($ucs & 0x0fc0) >> 6) | 0x80;

	# Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
	if ($ucs <= 0xffff)
	{
		my $lead = ($ucs >> 12) | 0xe0;
		return (($lead << 16) | ($tail_mid << 8) | $tail_low);
	}

	# Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	my $tail_high = (($ucs & 0x3ffff) >> 12) | 0x80;
	my $lead      = ($ucs >> 18) | 0xf0;
	return (($lead << 24) | ($tail_high << 16) | ($tail_mid << 8) | $tail_low);
}
#######################################################################
# read_source - common routine to read source file
#
# fname ; input file name
#
# Returns a reference to a list of mapping hashes, one per accepted input
# line, with the fields f (file name), l (line number), code, ucs, comment
# and direction (always "both").
#
# On a line that cannot be parsed, an error is printed to STDERR and the
# program exits with a non-zero status.
sub read_source
{
	my ($fname) = @_;
	my @r;

	open(my $in, '<', $fname) or die("cannot open $fname: $!");

	while (my $line = <$in>)
	{
		next if ($line =~ /^#/);

		# chomp, not chop: a final line without a trailing newline must not
		# lose its last character.
		chomp($line);

		next if ($line =~ /^$/);    # Ignore empty lines

		# A code followed directly by a comment has no Unicode column.
		next if ($line =~ /^0x([0-9A-F]+)\s+(#.*)$/);

		# Two hex columns, an optional third hex column, then the comment.
		# NOTE(review): the original remark here reads "Skip the first
		# column for JIS0208.TXT", yet the first two columns are the ones
		# stored below - confirm against the callers that read
		# three-column files.
		my ($code_hex, $ucs_hex, undef, $comment) =
		  $line =~ /^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/;
		if (!defined $code_hex)
		{
			print STDERR "READ ERROR at line $. in $fname: $line\n";
			# Report the failure to the caller (e.g. make) instead of
			# exiting with a success status.
			exit 1;
		}

		my $out = {
			f         => $fname,
			l         => $.,
			code      => hex($code_hex),
			ucs       => hex($ucs_hex),
			comment   => $comment,
			direction => "both"
		};

		# Ignore pure ASCII mappings. PostgreSQL character conversion code
		# never even passes these to the conversion code.
		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);

		push(@r, $out);
	}
	close($in);

	return \@r;
}
##################################################################
# print_tables : output mapping tables
#
# Arguments:
# charset - string name of the character set.
# table - mapping table (see format below)
# verbose - if 1, output comment on each line,
# if 2, also output source file name and number
#
#
#
# Mapping table format:
#
# Mapping table is a list of hashes. Each hash has the following fields:
# direction - Direction: 'both', 'from_unicode' or 'to_unicode'
# ucs - Unicode code point
# ucs_second - Second Unicode code point, if this is a "combined" character.
# code - Byte sequence in the "other" character set, as an integer
# comment - Text representation of the character
# f - Source filename
# l - Line number in source file
#
#
sub print_tables
{
	my ($charset, $table, $verbose) = @_;

	# One bucket per output file: plain and "combined" (two code point)
	# entries, for each conversion direction.
	my (@to_unicode, @to_unicode_combined);
	my (@from_unicode, @from_unicode_combined);

	foreach my $map (@$table)
	{
		my $is_combined = defined $map->{ucs_second};

		# Output entry; the code points are stored in their UTF-8 form.
		my %entry = (
			utf8    => ucs2utf($map->{ucs}),
			code    => $map->{code},
			comment => $map->{comment},
			f       => $map->{f},
			l       => $map->{l});
		$entry{utf8_second} = ucs2utf($map->{ucs_second}) if ($is_combined);

		# Pick the pair of buckets this entry may go into.
		my ($to_bucket, $from_bucket) =
		  $is_combined
		  ? (\@to_unicode_combined, \@from_unicode_combined)
		  : (\@to_unicode, \@from_unicode);

		# A "both" mapping lands in the tables of both directions.
		my $direction = $map->{direction};
		push @$to_bucket, \%entry
		  if ($direction eq "both" || $direction eq "to_unicode");
		push @$from_bucket, \%entry
		  if ($direction eq "both" || $direction eq "from_unicode");
	}

	# The combined tables are only written when they have any entries.
	print_to_utf8_map($charset, \@to_unicode, $verbose);
	if (scalar @to_unicode_combined > 0)
	{
		print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose);
	}
	print_from_utf8_map($charset, \@from_unicode, $verbose);
	if (scalar @from_unicode_combined > 0)
	{
		print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose);
	}
}
# print_from_utf8_map - write the UTF8 => charset table
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of entries with utf8, code, comment, f, l
# verbose ; if true, append a comment to each entry; if 2 or more, the
#           comment also carries the source file name and line number
#
# The output goes to "utf8_to_<charset>.map" (lower-cased) in the current
# directory, with the entries sorted by their utf8 value.
sub print_from_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	# The third argument is optional.
	$verbose = 0 if (!defined $verbose);

	my $fname = lc("utf8_to_${charset}.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname)
	  or die "cannot open output file : $fname: $!\n";

	# Names are passed as printf arguments rather than spliced into the
	# format, so that a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n", $fname;
	printf {$out} 'static const pg_utf_to_local ULmap%s[ %d ] = {',
	  $charset, scalar(@$table);

	# The comment of an entry is written after the comma that follows it,
	# so it is held back until the next iteration (or the end of the loop).
	my $pending_comment = "";
	my $separator       = "";
	foreach my $entry (sort { $a->{utf8} <=> $b->{utf8} } @$table)
	{
		print {$out} $separator;
		$separator = ",";

		print {$out} "\t/* $pending_comment */" if ($verbose);
		printf {$out} "\n {0x%04x, 0x%04x}", $entry->{utf8}, $entry->{code};

		$pending_comment =
		  ($verbose >= 2)
		  ? "$entry->{f}:$entry->{l} $entry->{comment}"
		  : $entry->{comment};
	}
	print {$out} "\t/* $pending_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors only show up at close time.
	close($out) or die "cannot close output file : $fname: $!\n";
}
# print_from_utf8_combined_map - write the UTF8 => charset table for
# "combined" characters, i.e. those made of two Unicode code points
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of entries with utf8, utf8_second, code
#           and comment
# verbose ; if true, append the entry's comment to each line
#
# The output goes to "utf8_to_<charset>_combined.map" (lower-cased) in the
# current directory, with the entries sorted by their utf8 value.
sub print_from_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("utf8_to_${charset}_combined.map");
	print "- Writing UTF8=>${charset} conversion table: $fname\n";
	open(my $out, '>', $fname)
	  or die "cannot open output file : $fname: $!\n";

	# Names are passed as printf arguments rather than spliced into the
	# format, so that a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n", $fname;
	printf {$out}
	  'static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {',
	  $charset, scalar(@$table);

	# The comment of an entry is written after the comma that follows it,
	# so it is held back until the next iteration (or the end of the loop).
	my $pending_comment = "";
	my $separator       = "";
	foreach my $entry (sort { $a->{utf8} <=> $b->{utf8} } @$table)
	{
		print {$out} $separator;
		$separator = ",";

		print {$out} "\t/* $pending_comment */" if ($verbose);
		printf {$out} "\n {0x%08x, 0x%08x, 0x%04x}",
		  $entry->{utf8}, $entry->{utf8_second}, $entry->{code};

		$pending_comment = "$entry->{comment}";
	}
	print {$out} "\t/* $pending_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors only show up at close time.
	close($out) or die "cannot close output file : $fname: $!\n";
}
# print_to_utf8_map - write the charset => UTF8 table
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of entries with code, utf8, comment, f, l
# verbose ; if true, append a comment to each entry; if 2 or more, the
#           comment also carries the source file name and line number
#
# The output goes to "<charset>_to_utf8.map" (lower-cased) in the current
# directory, with the entries sorted by their code value.
sub print_to_utf8_map
{
	my ($charset, $table, $verbose) = @_;

	# The third argument is optional.
	$verbose = 0 if (!defined $verbose);

	my $fname = lc("${charset}_to_utf8.map");
	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname)
	  or die "cannot open output file : $fname: $!\n";

	# Names are passed as printf arguments rather than spliced into the
	# format, so that a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n", $fname;
	printf {$out} 'static const pg_local_to_utf LUmap%s[ %d ] = {',
	  $charset, scalar(@$table);

	# The comment of an entry is written after the comma that follows it,
	# so it is held back until the next iteration (or the end of the loop).
	my $pending_comment = "";
	my $separator       = "";
	foreach my $entry (sort { $a->{code} <=> $b->{code} } @$table)
	{
		print {$out} $separator;
		$separator = ",";

		print {$out} "\t/* $pending_comment */" if ($verbose);
		printf {$out} "\n {0x%04x, 0x%x}", $entry->{code}, $entry->{utf8};

		$pending_comment =
		  ($verbose >= 2)
		  ? "$entry->{f}:$entry->{l} $entry->{comment}"
		  : $entry->{comment};
	}
	print {$out} "\t/* $pending_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors only show up at close time.
	close($out) or die "cannot close output file : $fname: $!\n";
}
# print_to_utf8_combined_map - write the charset => UTF8 table for
# "combined" characters, i.e. those made of two Unicode code points
#
# charset ; name of the character set, used in the file and array names
# table   ; reference to a list of entries with code, utf8, utf8_second
#           and comment
# verbose ; if true, append the entry's comment to each line
#
# The output goes to "<charset>_to_utf8_combined.map" (lower-cased) in the
# current directory, with the entries sorted by their code value.
sub print_to_utf8_combined_map
{
	my ($charset, $table, $verbose) = @_;

	my $fname = lc("${charset}_to_utf8_combined.map");
	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
	open(my $out, '>', $fname)
	  or die "cannot open output file : $fname: $!\n";

	# Names are passed as printf arguments rather than spliced into the
	# format, so that a '%' in them cannot be taken as a conversion.
	printf {$out} "/* src/backend/utils/mb/Unicode/%s */\n\n", $fname;
	printf {$out}
	  'static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {',
	  $charset, scalar(@$table);

	# The comment of an entry is written after the comma that follows it,
	# so it is held back until the next iteration (or the end of the loop).
	my $pending_comment = "";
	my $separator       = "";
	foreach my $entry (sort { $a->{code} <=> $b->{code} } @$table)
	{
		print {$out} $separator;
		$separator = ",";

		print {$out} "\t/* $pending_comment */" if ($verbose);
		printf {$out} "\n {0x%04x, 0x%08x, 0x%08x}",
		  $entry->{code}, $entry->{utf8}, $entry->{utf8_second};

		$pending_comment = "$entry->{comment}";
	}
	print {$out} "\t/* $pending_comment */" if ($verbose);
	print {$out} "\n};\n";

	# Buffered write errors only show up at close time.
	close($out) or die "cannot close output file : $fname: $!\n";
}
1;
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8.map */
static const pg_local_to_utf LUmapEUC_JIS_2004[ 11303 ] = { /* */
{0x0080, 0xc280}, /* U+0080 <control> */
{0x0081, 0xc281}, /* U+0081 <control> */
{0x0082, 0xc282}, /* U+0082 <control> */
......@@ -205,7 +204,7 @@ static const pg_local_to_utf LUmapEUC_JIS_2004[] = {
{0xa2ac, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0xa2ad, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0xa2ae, 0xe38093}, /* U+3013 GETA MARK */
{0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xa2af, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xa2b0, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0xa2b1, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0xa2b2, 0xefbd9e}, /* U+FF5E FULLWIDTH TILDE [2000] */
......
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/euc_jis_2004_to_utf8_combined.map */
static const pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0xa4f7, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0xa4f8, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0xa4f9, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */
......
/* src/backend/utils/mb/Unicode/euc_jp_to_utf8.map */
static const pg_local_to_utf LUmapEUC_JP[] = {
static const pg_local_to_utf LUmapEUC_JP[ 13197 ] = {
{0x8ea1, 0xefbda1},
{0x8ea2, 0xefbda2},
{0x8ea3, 0xefbda3},
......@@ -13197,5 +13197,5 @@ static const pg_local_to_utf LUmapEUC_JP[] = {
{0x8ff4fb, 0xe9ab99},
{0x8ff4fc, 0xe9adb2},
{0x8ff4fd, 0xefa8ad},
{0x8ff4fe, 0xe9bb91},
{0x8ff4fe, 0xe9bb91}
};
/* src/backend/utils/mb/Unicode/euc_kr_to_utf8.map */
static const pg_local_to_utf LUmapEUC_KR[ 8227 ] = {
{0xa1a1, 0xe38080},
{0xa1a2, 0xe38081},
......
/* src/backend/utils/mb/Unicode/johab_to_utf8.map */
static const pg_local_to_utf LUmapJOHAB[ 17049 ] = {
{0x8444, 0xe384b3},
{0x8446, 0xe384b5},
......
/*
* This file was generated by UCS_to_SHIFTJIS_2004.pl
*/
static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8.map */
static const pg_local_to_utf LUmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0x00a1, 0xefbda1}, /* U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP */
{0x00a2, 0xefbda2}, /* U+FF62 HALFWIDTH LEFT CORNER BRACKET */
{0x00a3, 0xefbda3}, /* U+FF63 HALFWIDTH RIGHT CORNER BRACKET */
......@@ -173,7 +172,7 @@ static const pg_local_to_utf LUmapSHIFT_JIS_2004[] = {
{0x81aa, 0xe28691}, /* U+2191 UPWARDS ARROW */
{0x81ab, 0xe28693}, /* U+2193 DOWNWARDS ARROW */
{0x81ac, 0xe38093}, /* U+3013 GETA MARK */
{0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0x81ad, 0xefbc87}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0x81ae, 0xefbc82}, /* U+FF02 FULLWIDTH QUOTATION MARK [2000] */
{0x81af, 0xefbc8d}, /* U+FF0D FULLWIDTH HYPHEN-MINUS [2000] */
{0x81b0, 0x7e}, /* U+007E TILDE [2000] Fullwidth: U+FF5E */
......
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/shift_jis_2004_to_utf8_combined.map */
static const pg_local_to_utf_combined LUmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x82f5, 0x00e3818b, 0x00e3829a}, /* U+304B+309A [2000] */
{0x82f6, 0x00e3818d, 0x00e3829a}, /* U+304D+309A [2000] */
{0x82f7, 0x00e3818f, 0x00e3829a}, /* U+304F+309A [2000] */
......
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/ucs2utf.pl
# convert UCS-4 to UTF-8
#
# Takes a Unicode code point as an integer and returns its UTF-8 byte
# sequence packed into one integer, lead byte in the most significant
# position (for example U+00A2 gives 0xc2a2).
sub ucs2utf
{
	# Lexical variables: the values are private to this call, and the sub
	# also compiles under "use strict".
	my ($ucs) = @_;
	my $utf;

	if ($ucs <= 0x007f)
	{
		# 7-bit code points are passed through untouched.
		$utf = $ucs;
	}
	elsif ($ucs <= 0x07ff)
	{
		# Two-byte form: 110xxxxx 10xxxxxx
		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
	}
	elsif ($ucs <= 0xffff)
	{
		# Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 12) | 0xe0) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
	}
	else
	{
		# Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$utf =
		  ((($ucs >> 18) | 0xf0) << 24) |
		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
	}
	return ($utf);
}
1;
/* src/backend/utils/mb/Unicode/uhc_to_utf8.map */
static const pg_local_to_utf LUmapUHC[ 17237 ] = {
{0x8141, 0xeab082},
{0x8142, 0xeab083},
......
/* src/backend/utils/mb/Unicode/utf8_to_euc_cn.map */
static const pg_utf_to_local ULmapEUC_CN[ 7445 ] = {
{0xc2a4, 0xa1e8},
{0xc2a7, 0xa1ec},
......
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004.map */
static const pg_utf_to_local ULmapEUC_JIS_2004[ 11303 ] = { /* */
{0xc280, 0x0080}, /* U+0080 <control> */
{0xc281, 0x0081}, /* U+0081 <control> */
{0xc282, 0x0082}, /* U+0082 <control> */
......@@ -10849,7 +10848,7 @@ static const pg_utf_to_local ULmapEUC_JIS_2004[] = {
{0xefbc84, 0xa1f0}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0xa1f3}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0xa1f5}, /* U+FF06 FULLWIDTH AMPERSAND */
{0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xefbc87, 0xa2af}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0xa1ca}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0xa1cb}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0xa1f6}, /* U+FF0A FULLWIDTH ASTERISK */
......
/*
* This file was generated by UCS_to_EUC_JIS_2004.pl
*/
static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/utf8_to_euc_jis_2004_combined.map */
static const pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0xabc4}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0xabc8}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0xabc9}, /* U+0254+0301 [2000] */
......
/* src/backend/utils/mb/Unicode/utf8_to_euc_jp.map */
static const pg_utf_to_local ULmapEUC_JP[ 13175 ] = {
{0xc2a1, 0x8fa2c2},
{0xc2a4, 0x8fa2f0},
......
/* src/backend/utils/mb/Unicode/utf8_to_euc_kr.map */
static const pg_utf_to_local ULmapEUC_KR[ 8227 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},
......
/* src/backend/utils/mb/Unicode/utf8_to_johab.map */
static const pg_utf_to_local ULmapJOHAB[ 17049 ] = {
{0xc2a1, 0xd9ae},
{0xc2a4, 0xd9b4},
......
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004.map */
static const pg_utf_to_local ULmapSHIFT_JIS_2004[ 11271 ] = { /* */
{0xc2a0, 0x8541}, /* U+00A0 NO-BREAK SPACE [2000] */
{0xc2a1, 0x8542}, /* U+00A1 INVERTED EXCLAMATION MARK [2000] */
{0xc2a2, 0x8191}, /* U+00A2 CENT SIGN Windows: U+FFE0 */
......@@ -10817,7 +10816,7 @@ static const pg_utf_to_local ULmapSHIFT_JIS_2004[] = {
{0xefbc84, 0x8190}, /* U+FF04 FULLWIDTH DOLLAR SIGN */
{0xefbc85, 0x8193}, /* U+FF05 FULLWIDTH PERCENT SIGN */
{0xefbc86, 0x8195}, /* U+FF06 FULLWIDTH AMPERSAND */
{0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE [2000] */
{0xefbc87, 0x81ad}, /* U+FF07 FULLWIDTH APOSTROPHE */
{0xefbc88, 0x8169}, /* U+FF08 FULLWIDTH LEFT PARENTHESIS */
{0xefbc89, 0x816a}, /* U+FF09 FULLWIDTH RIGHT PARENTHESIS */
{0xefbc8a, 0x8196}, /* U+FF0A FULLWIDTH ASTERISK */
......
/*
* This file was generated by UCS_to_SHIFT_JIS_2004.pl
*/
static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[] = {
/* src/backend/utils/mb/Unicode/utf8_to_shift_jis_2004_combined.map */
static const pg_utf_to_local_combined ULmapSHIFT_JIS_2004_combined[ 25 ] = { /* */
{0x0000c3a6, 0x0000cc80, 0x8663}, /* U+00E6+0300 [2000] */
{0x0000c994, 0x0000cc80, 0x8667}, /* U+0254+0300 [2000] */
{0x0000c994, 0x0000cc81, 0x8668}, /* U+0254+0301 [2000] */
......
......@@ -3,7 +3,7 @@
static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xc2a2, 0x8191},
{0xc2a3, 0x8192},
{0xc2a5, 0x5c},
{0xc2a5, 0x005c},
{0xc2a7, 0x8198},
{0xc2a8, 0x814e},
{0xc2ac, 0x81ca},
......@@ -142,7 +142,7 @@ static const pg_utf_to_local ULmapSJIS[ 7397 ] = {
{0xe280b2, 0x818c},
{0xe280b3, 0x818d},
{0xe280bb, 0x81a6},
{0xe280be, 0x7e},
{0xe280be, 0x007e},
{0xe28483, 0x818e},
{0xe28496, 0xfa59},
{0xe284a1, 0xfa5a},
......
/* src/backend/utils/mb/Unicode/utf8_to_uhc.map */
static const pg_utf_to_local ULmapUHC[ 17237 ] = {
{0xc2a1, 0xa2ae},
{0xc2a4, 0xa2b4},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment