Commit a5073871 authored by Thomas Munro

Fix conversion table generator scripts.

convutils.pm relied on implicit conversion of undefined values to integer
zero.  Some of the conversion scripts are susceptible to regexp greediness.
Fix both, avoiding whitespace changes in the output.  Also update the ICU
URLs that moved.

No need to back-patch, because the output of these scripts is also in
the source tree so we shouldn't need to rerun them on back-branches.

Author: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJ7SEGLbj%3D%3DTQCcyKRA9aqj8%2B6L%3DexSq1y25TA%3DWxLziQ%40mail.gmail.com
parent e47c2602
...@@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt: ...@@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
$(DOWNLOAD) http://x0213.org/codetable/$(@F) $(DOWNLOAD) http://x0213.org/codetable/$(@F)
gb-18030-2000.xml windows-949-2000.xml: gb-18030-2000.xml windows-949-2000.xml:
$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F) $(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
GB2312.TXT: GB2312.TXT:
$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt' $(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
......
...@@ -24,12 +24,13 @@ my @all; ...@@ -24,12 +24,13 @@ my @all;
while (my $line = <$in>) while (my $line = <$in>)
{ {
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
{ {
# combined characters # combined characters
my ($c, $u1, $u2) = ($1, $2, $3); my ($c, $u1, $u2) = ($1, $2, $3);
my $rest = "U+" . $u1 . "+" . $u2 . $4; # The "\t \t" below is just to avoid insubstantial diffs.
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
my $code = hex($c); my $code = hex($c);
my $ucs1 = hex($u1); my $ucs1 = hex($u1);
my $ucs2 = hex($u2); my $ucs2 = hex($u2);
...@@ -45,7 +46,7 @@ while (my $line = <$in>) ...@@ -45,7 +46,7 @@ while (my $line = <$in>)
l => $. l => $.
}; };
} }
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
{ {
# non-combined characters # non-combined characters
......
...@@ -80,7 +80,8 @@ foreach my $i (@$ct932) ...@@ -80,7 +80,8 @@ foreach my $i (@$ct932)
} }
} }
foreach my $i (@mapping) # extract only SJIS characters
foreach my $i (grep defined $_->{sjis}, @mapping)
{ {
my $sjis = $i->{sjis}; my $sjis = $i->{sjis};
......
...@@ -24,12 +24,13 @@ my @mapping; ...@@ -24,12 +24,13 @@ my @mapping;
while (my $line = <$in>) while (my $line = <$in>)
{ {
if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
{ {
# combined characters # combined characters
my ($c, $u1, $u2) = ($1, $2, $3); my ($c, $u1, $u2) = ($1, $2, $3);
my $rest = "U+" . $u1 . "+" . $u2 . $4; # The "\t \t" below is just to avoid insubstantial diffs.
my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
my $code = hex($c); my $code = hex($c);
my $ucs1 = hex($u1); my $ucs1 = hex($u1);
my $ucs2 = hex($u2); my $ucs2 = hex($u2);
...@@ -45,7 +46,7 @@ while (my $line = <$in>) ...@@ -45,7 +46,7 @@ while (my $line = <$in>)
l => $. l => $.
}; };
} }
elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
{ {
# non-combined characters # non-combined characters
......
...@@ -380,7 +380,8 @@ sub print_radix_table ...@@ -380,7 +380,8 @@ sub print_radix_table
{ {
header => "Dummy map, for invalid values", header => "Dummy map, for invalid values",
min_idx => 0, min_idx => 0,
max_idx => $widest_range max_idx => $widest_range,
label => "dummy map"
}; };
### ###
...@@ -471,35 +472,37 @@ sub print_radix_table ...@@ -471,35 +472,37 @@ sub print_radix_table
} }
# Also look up the positions of the roots in the table. # Also look up the positions of the roots in the table.
my $b1root = $segmap{"1-byte"}; # Missing map represents dummy mapping.
my $b2root = $segmap{"2-byte"}; my $b1root = $segmap{"1-byte"} || 0;
my $b3root = $segmap{"3-byte"}; my $b2root = $segmap{"2-byte"} || 0;
my $b4root = $segmap{"4-byte"}; my $b3root = $segmap{"3-byte"} || 0;
my $b4root = $segmap{"4-byte"} || 0;
# And the lower-upper values of each level in each radix tree. # And the lower-upper values of each level in each radix tree.
my $b1_lower = $min_idx{1}{1}; # Missing values represent zero.
my $b1_upper = $max_idx{1}{1}; my $b1_lower = $min_idx{1}{1} || 0;
my $b1_upper = $max_idx{1}{1} || 0;
my $b2_1_lower = $min_idx{2}{1};
my $b2_1_upper = $max_idx{2}{1}; my $b2_1_lower = $min_idx{2}{1} || 0;
my $b2_2_lower = $min_idx{2}{2}; my $b2_1_upper = $max_idx{2}{1} || 0;
my $b2_2_upper = $max_idx{2}{2}; my $b2_2_lower = $min_idx{2}{2} || 0;
my $b2_2_upper = $max_idx{2}{2} || 0;
my $b3_1_lower = $min_idx{3}{1};
my $b3_1_upper = $max_idx{3}{1}; my $b3_1_lower = $min_idx{3}{1} || 0;
my $b3_2_lower = $min_idx{3}{2}; my $b3_1_upper = $max_idx{3}{1} || 0;
my $b3_2_upper = $max_idx{3}{2}; my $b3_2_lower = $min_idx{3}{2} || 0;
my $b3_3_lower = $min_idx{3}{3}; my $b3_2_upper = $max_idx{3}{2} || 0;
my $b3_3_upper = $max_idx{3}{3}; my $b3_3_lower = $min_idx{3}{3} || 0;
my $b3_3_upper = $max_idx{3}{3} || 0;
my $b4_1_lower = $min_idx{4}{1};
my $b4_1_upper = $max_idx{4}{1}; my $b4_1_lower = $min_idx{4}{1} || 0;
my $b4_2_lower = $min_idx{4}{2}; my $b4_1_upper = $max_idx{4}{1} || 0;
my $b4_2_upper = $max_idx{4}{2}; my $b4_2_lower = $min_idx{4}{2} || 0;
my $b4_3_lower = $min_idx{4}{3}; my $b4_2_upper = $max_idx{4}{2} || 0;
my $b4_3_upper = $max_idx{4}{3}; my $b4_3_lower = $min_idx{4}{3} || 0;
my $b4_4_lower = $min_idx{4}{4}; my $b4_3_upper = $max_idx{4}{3} || 0;
my $b4_4_upper = $max_idx{4}{4}; my $b4_4_lower = $min_idx{4}{4} || 0;
my $b4_4_upper = $max_idx{4}{4} || 0;
### ###
### Find the maximum value in the whole table, to determine if we can ### Find the maximum value in the whole table, to determine if we can
...@@ -607,7 +610,8 @@ sub print_radix_table ...@@ -607,7 +610,8 @@ sub print_radix_table
for (my $j = 0; for (my $j = 0;
$j < $vals_per_line && $i <= $seg->{max_idx}; $j++) $j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
{ {
my $val = $seg->{values}->{$i}; # missing values represent zero.
my $val = $seg->{values}->{$i} || 0;
printf $out " 0x%0*x", $colwidth, $val; printf $out " 0x%0*x", $colwidth, $val;
$off++; $off++;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment