Commit 80f8eb79 authored by Michael Paquier

Use perfect hash for NFC and NFKC Unicode Normalization quick check

This makes the normalization quick check about 30% faster for NFC and
50% faster for NFKC than the binary search used previously.  The hash
lookup reuses the existing array of bit fields used for the binary
search to get the quick check property and is generated as part of "make
update-unicode" in src/common/unicode/.

Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
parent 85d08b8b
......@@ -9,6 +9,10 @@
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;
my %data;
print
......@@ -18,13 +22,25 @@ print <<EOS;
#include "common/unicode_norm.h"
/*
* We use a bit field here to save space.
* Normalization quick check entry for codepoint. We use a bit field
* here to save space.
*/
typedef struct
{
unsigned int codepoint:21;
signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;
/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);
/* Information for quick check lookup with perfect hash function */
typedef struct
{
const pg_unicode_normprops *normprops;
qc_hash_func hash;
int num_normprops;
} pg_unicode_norminfo;
EOS
foreach my $line (<ARGV>)
......@@ -66,6 +82,7 @@ foreach my $prop (sort keys %data)
"static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
my %subdata = %{ $data{$prop} };
my @cp_packed;
foreach my $cp (sort { $a <=> $b } keys %subdata)
{
my $qc;
......@@ -82,7 +99,27 @@ foreach my $prop (sort keys %data)
die;
}
printf "\t{0x%04X, %s},\n", $cp, $qc;
# Save the bytes as a string in network order.
push @cp_packed, pack('N', $cp);
}
print "};\n";
# Emit the definition of the perfect hash function.
#
# The keys are the entries of @cp_packed.  Each one was built with
# pack('N', $cp), i.e. a 4-byte string in network (big-endian) byte order,
# which is why fixed_key_length is 4.  The generated function is named
# after the property being processed: "<prop>_hash_func".
my $funcname = $prop . '_hash_func';
my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
fixed_key_length => 4);
# NOTE(review): $f is presumably the C source text of the generated
# function; it is printed right after "static", preceded by a C comment
# naming the property.
printf "\n/* Perfect hash function for %s */", $prop;
print "\nstatic $f\n";
# Emit the structure that wraps the hash lookup information into
# one variable.
#
# The initializer values are printed in this order: the name of the
# UnicodeNormProps_<prop> array, the name of the hash function, and the
# number of keys in @cp_packed.
printf "/* Hash lookup information for %s */", $prop;
printf "\nstatic const pg_unicode_norminfo ";
printf "UnicodeNormInfo_%s = {\n", $prop;
printf "\tUnicodeNormProps_%s,\n", $prop;
printf "\t%s,\n", $funcname;
printf "\t%d\n", scalar @cp_packed;
printf "};\n";
}
......@@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
return entry->comb_class;
}
static int
qc_compare(const void *p1, const void *p2)
static const pg_unicode_normprops *
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{
uint32 v1,
v2;
int h;
uint32 hashkey;
v1 = ((const pg_unicode_normprops *) p1)->codepoint;
v2 = ((const pg_unicode_normprops *) p2)->codepoint;
return (v1 - v2);
/*
* Compute the hash function. The hash key is the codepoint with the bytes
* in network order.
*/
hashkey = htonl(ch);
h = norminfo->hash(&hashkey);
/* An out-of-range result implies no match */
if (h < 0 || h >= norminfo->num_normprops)
return NULL;
/*
* Since it's a perfect hash, we need only match to the specific codepoint
* it identifies.
*/
if (ch != norminfo->normprops[h].codepoint)
return NULL;
/* Success! */
return &norminfo->normprops[h];
}
/*
......@@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
pg_unicode_normprops key;
pg_unicode_normprops *found = NULL;
key.codepoint = ch;
const pg_unicode_normprops *found = NULL;
switch (form)
{
case UNICODE_NFC:
found = bsearch(&key,
UnicodeNormProps_NFC_QC,
lengthof(UnicodeNormProps_NFC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
break;
case UNICODE_NFKC:
found = bsearch(&key,
UnicodeNormProps_NFKC_QC,
lengthof(UnicodeNormProps_NFKC_QC),
sizeof(pg_unicode_normprops),
qc_compare);
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
break;
default:
Assert(false);
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -18,6 +18,11 @@ src/backend/utils/fmgrprotos\.h$
# they match pgindent style, they'd look worse not better, so exclude them.
kwlist_d\.h$
#
# This is generated by the scripts from src/common/unicode/. It uses
# hash functions generated by PerfectHash.pm whose format looks worse with
# pgindent.
src/include/common/unicode_normprops_table\.h$
#
# Exclude ecpg test files to avoid breaking the ecpg regression tests
# (but include files at the top level of the ecpg/test/ directory).
src/interfaces/ecpg/test/.*/
......
......@@ -3191,6 +3191,7 @@ pg_tz
pg_tz_cache
pg_tzenum
pg_unicode_decomposition
pg_unicode_norminfo
pg_unicode_normprops
pg_utf_to_local_combined
pg_uuid_t
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment