Commit c64d0cd5 authored by Tom Lane

Use perfect hashing, instead of binary search, for keyword lookup.

We've been speculating for a long time that hash-based keyword lookup
ought to be faster than binary search, but up to now we hadn't found
a suitable tool for generating the hash function.  Joerg Sonnenberger
provided the inspiration, and sample code, to show us that rolling our
own generator wasn't a ridiculous idea.  Hence, do that.

The method used here requires a lookup table of approximately 4 bytes
per keyword, but that's less than what we saved in the predecessor commit
afb0d071, so it's not a big problem.  The time savings is indeed
significant: preliminary testing suggests that the total time for raw
parsing (flex + bison phases) drops by ~20%.

Patch by me, but it owes its existence to Joerg Sonnenberger;
thanks also to John Naylor for review.

Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
parent 5d59a6c5
......@@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
distprep: kwlist_d.h
......@@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
# generate SQL keyword lookup table to be included into keywords*.o.
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --extern $<
# Dependencies of keywords*.o need to be managed explicitly to make sure
# that you don't get broken parsing code, even in a non-enable-depend build.
......
......@@ -35,60 +35,51 @@
* receive a different case-normalization mapping.
*/
int
ScanKeywordLookup(const char *text,
ScanKeywordLookup(const char *str,
const ScanKeywordList *keywords)
{
int len,
i;
char word[NAMEDATALEN];
const char *kw_string;
const uint16 *kw_offsets;
const uint16 *low;
const uint16 *high;
len = strlen(text);
size_t len;
int h;
const char *kw;
/*
* Reject immediately if too long to be any keyword. This saves useless
* hashing and downcasing work on long strings.
*/
len = strlen(str);
if (len > keywords->max_kw_len)
return -1; /* too long to be any keyword */
/* We assume all keywords are shorter than NAMEDATALEN. */
Assert(len < NAMEDATALEN);
return -1;
/*
* Apply an ASCII-only downcasing. We must not use tolower() since it may
* produce the wrong translation in some locales (eg, Turkish).
* Compute the hash function. We assume it was generated to produce
* case-insensitive results. Since it's a perfect hash, we need only
* match to the specific keyword it identifies.
*/
for (i = 0; i < len; i++)
{
char ch = text[i];
h = keywords->hash(str, len);
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
word[i] = ch;
}
word[len] = '\0';
/* An out-of-range result implies no match */
if (h < 0 || h >= keywords->num_keywords)
return -1;
/*
* Now do a binary search using plain strcmp() comparison.
* Compare character-by-character to see if we have a match, applying an
* ASCII-only downcasing to the input characters. We must not use
* tolower() since it may produce the wrong translation in some locales
* (eg, Turkish).
*/
kw_string = keywords->kw_string;
kw_offsets = keywords->kw_offsets;
low = kw_offsets;
high = kw_offsets + (keywords->num_keywords - 1);
while (low <= high)
kw = GetScanKeyword(h, keywords);
while (*str != '\0')
{
const uint16 *middle;
int difference;
char ch = *str++;
middle = low + (high - low) / 2;
difference = strcmp(kw_string + *middle, word);
if (difference == 0)
return middle - kw_offsets;
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
if (ch != *kw++)
return -1;
}
if (*kw != '\0')
return -1;
return -1;
/* Success! */
return h;
}
......@@ -14,6 +14,9 @@
#ifndef KWLOOKUP_H
#define KWLOOKUP_H
/*
 * Hash function used by ScanKeywordLookup.
 *
 * It is called with the candidate string and its length in bytes, and
 * returns an int that ScanKeywordLookup uses as an index into the keyword
 * list.  A result that is negative or >= num_keywords is treated as
 * "no match".
 */
typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);
/*
* This struct contains the data needed by ScanKeywordLookup to perform a
* search within a set of keywords. The contents are typically generated by
......@@ -23,6 +26,7 @@ typedef struct ScanKeywordList
{
const char *kw_string; /* all keywords in order, separated by \0 */
const uint16 *kw_offsets; /* offsets to the start of each keyword */
ScanKeywordHashFunc hash; /* perfect hash function for keywords */
int num_keywords; /* number of keywords */
int max_kw_len; /* length of longest keyword */
} ScanKeywordList;
......
......@@ -21,8 +21,7 @@
/*
* List of keyword (name, token-value, category) entries.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value, category */
......
......@@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \
keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
$(WIN32RES)
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
# Suppress parallel build to avoid a bug in GNU make 3.82
# (see comments in ../Makefile)
......@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
# generate keyword headers
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
# Force these dependencies to be known even without dependency info built:
ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
......
......@@ -9,8 +9,6 @@
*/
#include "postgres_fe.h"
#include <ctype.h>
#include "preproc_extern.h"
#include "preproc.h"
......@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
*
* Returns the token value of the keyword, or -1 if no match.
*
* Do a binary search using plain strcmp() comparison. This is much like
* Do a hash search using plain strcmp() comparison. This is much like
* ScanKeywordLookup(), except we want case-sensitive matching.
*/
int
ScanCKeywordLookup(const char *text)
ScanCKeywordLookup(const char *str)
{
const char *kw_string;
const uint16 *kw_offsets;
const uint16 *low;
const uint16 *high;
size_t len;
int h;
const char *kw;
/*
* Reject immediately if too long to be any keyword. This saves useless
* hashing work on long strings.
*/
len = strlen(str);
if (len > ScanCKeywords.max_kw_len)
return -1;
if (strlen(text) > ScanCKeywords.max_kw_len)
return -1; /* too long to be any keyword */
/*
* Compute the hash function. Since it's a perfect hash, we need only
* match to the specific keyword it identifies.
*/
h = ScanCKeywords_hash_func(str, len);
kw_string = ScanCKeywords.kw_string;
kw_offsets = ScanCKeywords.kw_offsets;
low = kw_offsets;
high = kw_offsets + (ScanCKeywords.num_keywords - 1);
/* An out-of-range result implies no match */
if (h < 0 || h >= ScanCKeywords.num_keywords)
return -1;
while (low <= high)
{
const uint16 *middle;
int difference;
kw = GetScanKeyword(h, &ScanCKeywords);
middle = low + (high - low) / 2;
difference = strcmp(kw_string + *middle, text);
if (difference == 0)
return ScanCKeywordTokens[middle - kw_offsets];
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
}
if (strcmp(kw, str) == 0)
return ScanCKeywordTokens[h];
return -1;
}
......@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */
......
......@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */
......
......@@ -29,7 +29,10 @@ REGRESS_OPTS = --dbname=$(PL_TESTDB)
REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \
plpgsql_cache plpgsql_transaction plpgsql_trigger plpgsql_varprops
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
# where to find gen_keywordlist.pl and subsidiary files
TOOLSDIR = $(top_srcdir)/src/tools
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
all: all-lib
......@@ -76,11 +79,11 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p
$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
# generate keyword headers for the scanner
pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST)
$(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
check: submake
......
......@@ -20,10 +20,9 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* Be careful not to put the same word in both lists.
* Be careful not to put the same word into pl_unreserved_kwlist.h.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */
......
......@@ -20,11 +20,10 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
* Be careful not to put the same word in both lists. Also be sure that
* pl_gram.y's unreserved_keyword production agrees with this list.
* Be careful not to put the same word into pl_reserved_kwlist.h. Also be
* sure that pl_gram.y's unreserved_keyword production agrees with this list.
*
* !!WARNING!!: This list must be sorted by ASCII name, because binary
* search is used to locate entries.
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */
......
This diff is collapsed.
......@@ -14,6 +14,12 @@
# variable named according to the -v switch ("ScanKeywords" by default).
# The variable is marked "static" unless the -e switch is given.
#
# ScanKeywordList uses hash-based lookup, so this script also selects
# a minimal perfect hash function for the keyword set, and emits a
# static hash function that is referenced in the ScanKeywordList struct.
# The hash function is case-insensitive unless --no-case-fold is specified.
# Note that case folding works correctly only for all-ASCII keywords!
#
#
# Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
......@@ -25,15 +31,18 @@
use strict;
use warnings;
use Getopt::Long;
use PerfectHash;
my $output_path = '';
my $extern = 0;
my $case_fold = 1;
my $varname = 'ScanKeywords';
GetOptions(
'output:s' => \$output_path,
'extern' => \$extern,
'varname:s' => \$varname) || usage();
'output:s' => \$output_path,
'extern' => \$extern,
'case-fold!' => \$case_fold,
'varname:s' => \$varname) || usage();
my $kw_input_file = shift @ARGV || die "No input file.\n";
......@@ -87,7 +96,22 @@ while (<$kif>)
}
}
# When being case-insensitive, insist that the input be all-lower-case.
if ($case_fold)
{
foreach my $kw (@keywords)
{
die qq|The keyword "$kw" is not lower-case in $kw_input_file\n|
if ($kw ne lc $kw);
}
}
# Error out if the keyword names are not in ASCII order.
#
# While this isn't really necessary with hash-based lookup, it's still
# helpful because it provides a cheap way to reject duplicate keywords.
# Also, insisting on sorted order ensures that code that scans the keyword
# table linearly will see the keywords in a canonical order.
for my $i (0..$#keywords - 1)
{
die qq|The keyword "$keywords[$i + 1]" is out of order in $kw_input_file\n|
......@@ -128,15 +152,25 @@ print $kwdef "};\n\n";
printf $kwdef "#define %s_NUM_KEYWORDS %d\n\n", uc $varname, scalar @keywords;
# Emit the definition of the hash function.
my $funcname = $varname . "_hash_func";
my $f = PerfectHash::generate_hash_function(\@keywords, $funcname,
case_fold => $case_fold);
printf $kwdef qq|static %s\n|, $f;
# Emit the struct that wraps all this lookup info into one variable.
print $kwdef "static " if !$extern;
printf $kwdef "static " if !$extern;
printf $kwdef "const ScanKeywordList %s = {\n", $varname;
printf $kwdef qq|\t%s_kw_string,\n|, $varname;
printf $kwdef qq|\t%s_kw_offsets,\n|, $varname;
printf $kwdef qq|\t%s,\n|, $funcname;
printf $kwdef qq|\t%s_NUM_KEYWORDS,\n|, uc $varname;
printf $kwdef qq|\t%d\n|, $max_len;
print $kwdef "};\n\n";
printf $kwdef "};\n\n";
printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;
......@@ -144,10 +178,11 @@ printf $kwdef "#endif\t\t\t\t\t\t\t/* %s_H */\n", uc $base_filename;
sub usage
{
die <<EOM;
Usage: gen_keywordlist.pl [--output/-o <path>] [--varname/-v <varname>] [--extern/-e] input_file
--output Output directory (default '.')
--varname Name for ScanKeywordList variable (default 'ScanKeywords')
--extern Allow the ScanKeywordList variable to be globally visible
Usage: gen_keywordlist.pl [--output/-o <path>] [--varname/-v <varname>] [--extern/-e] [--[no-]case-fold] input_file
--output Output directory (default '.')
--varname Name for ScanKeywordList variable (default 'ScanKeywords')
--extern Allow the ScanKeywordList variable to be globally visible
--no-case-fold Keyword matching is to be case-sensitive
gen_keywordlist.pl transforms a list of keywords into a ScanKeywordList.
The output filename is derived from the input file by inserting _d,
......
......@@ -414,7 +414,7 @@ sub GenerateFiles
'src/include/parser/kwlist.h'))
{
print "Generating kwlist_d.h...\n";
system('perl src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h');
system('perl -I src/tools src/tools/gen_keywordlist.pl --extern -o src/common src/include/parser/kwlist.h');
}
if (IsNewer(
......@@ -426,8 +426,8 @@ sub GenerateFiles
{
print "Generating pl_reserved_kwlist_d.h and pl_unreserved_kwlist_d.h...\n";
chdir('src/pl/plpgsql/src');
system('perl ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h');
system('perl ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h');
system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ReservedPLKeywords pl_reserved_kwlist.h');
system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname UnreservedPLKeywords pl_unreserved_kwlist.h');
chdir('../../../..');
}
......@@ -440,8 +440,8 @@ sub GenerateFiles
{
print "Generating c_kwlist_d.h and ecpg_kwlist_d.h...\n";
chdir('src/interfaces/ecpg/preproc');
system('perl ../../../tools/gen_keywordlist.pl --varname ScanCKeywords c_kwlist.h');
system('perl ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanCKeywords --no-case-fold c_kwlist.h');
system('perl -I ../../../tools ../../../tools/gen_keywordlist.pl --varname ScanECPGKeywords ecpg_kwlist.h');
chdir('../../../..');
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment