Commit 3ccae48f authored by Tom Lane

Support indexing of regular-expression searches in contrib/pg_trgm.

This works by extracting trigrams from the given regular expression,
in generally the same spirit as the previously-existing support for
LIKE searches, though of course the details are far more complicated.

Currently, only GIN indexes are supported.  We might be able to make
it work with GiST indexes later.

The implementation includes adding API functions to backend/regex/
to provide a view of the search NFA created from a regular expression.
These functions are meant to be generic enough to be supportable in
a standalone version of the regex library, should that ever happen.

Alexander Korotkov, reviewed by Heikki Linnakangas and Tom Lane
parent e60d20a3
# contrib/pg_trgm/Makefile # contrib/pg_trgm/Makefile
MODULE_big = pg_trgm MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o trgm_gin.o OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o
EXTENSION = pg_trgm EXTENSION = pg_trgm
DATA = pg_trgm--1.0.sql pg_trgm--unpackaged--1.0.sql DATA = pg_trgm--1.1.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
REGRESS = pg_trgm REGRESS = pg_trgm
......
...@@ -60,7 +60,7 @@ select similarity('---', '####---'); ...@@ -60,7 +60,7 @@ select similarity('---', '####---');
(1 row) (1 row)
CREATE TABLE test_trgm(t text); CREATE TABLE test_trgm(t text);
\copy test_trgm from 'data/trgm.data \copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
t | sml t | sml
-------------+---------- -------------+----------
...@@ -3470,6 +3470,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 ...@@ -3470,6 +3470,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
create table test2(t text); create table test2(t text);
insert into test2 values ('abcdef'); insert into test2 values ('abcdef');
insert into test2 values ('quark'); insert into test2 values ('quark');
insert into test2 values (' z foo bar');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops); create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
explain (costs off) explain (costs off)
...@@ -3521,6 +3522,142 @@ select * from test2 where t ilike 'qua%'; ...@@ -3521,6 +3522,142 @@ select * from test2 where t ilike 'qua%';
quark quark
(1 row) (1 row)
select * from test2 where t like '%z foo bar%';
t
-------------
z foo bar
(1 row)
select * from test2 where t like ' z foo%';
t
-------------
z foo bar
(1 row)
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
QUERY PLAN
--------------------------------------------
Bitmap Heap Scan on test2
Recheck Cond: (t ~ '[abc]{3}'::text)
-> Bitmap Index Scan on test2_idx_gin
Index Cond: (t ~ '[abc]{3}'::text)
(4 rows)
explain (costs off)
select * from test2 where t ~* 'DEF';
QUERY PLAN
------------------------------------------
Bitmap Heap Scan on test2
Recheck Cond: (t ~* 'DEF'::text)
-> Bitmap Index Scan on test2_idx_gin
Index Cond: (t ~* 'DEF'::text)
(4 rows)
select * from test2 where t ~ '[abc]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'a[bc]+d';
t
--------
abcdef
(1 row)
select * from test2 where t ~ '(abc)*$';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* 'DEF';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'dEf';
t
---
(0 rows)
select * from test2 where t ~* '^q';
t
-------
quark
(1 row)
select * from test2 where t ~* '[abc]{3}[def]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* 'ab[a-z]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* '(^| )qua';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q.*rk$';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q';
t
-------
quark
(1 row)
select * from test2 where t ~ '[a-z]{3}';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
t
---
(0 rows)
select * from test2 where t ~ 'z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo';
t
-------------
z foo bar
(1 row)
drop index test2_idx_gin; drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops); create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
......
/* contrib/pg_trgm/pg_trgm--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
/* contrib/pg_trgm/pg_trgm--1.0.sql */ /* contrib/pg_trgm/pg_trgm--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION -- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit \echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit
...@@ -164,3 +164,9 @@ AS ...@@ -164,3 +164,9 @@ AS
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 3 pg_catalog.~~ (text, text), OPERATOR 3 pg_catalog.~~ (text, text),
OPERATOR 4 pg_catalog.~~* (text, text); OPERATOR 4 pg_catalog.~~* (text, text);
-- Add operators that are new in 9.3.
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
# pg_trgm extension # pg_trgm extension
comment = 'text similarity measurement and index searching based on trigrams' comment = 'text similarity measurement and index searching based on trigrams'
default_version = '1.0' default_version = '1.1'
module_pathname = '$libdir/pg_trgm' module_pathname = '$libdir/pg_trgm'
relocatable = true relocatable = true
...@@ -15,7 +15,7 @@ select similarity('---', '####---'); ...@@ -15,7 +15,7 @@ select similarity('---', '####---');
CREATE TABLE test_trgm(t text); CREATE TABLE test_trgm(t text);
\copy test_trgm from 'data/trgm.data \copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
...@@ -43,6 +43,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 ...@@ -43,6 +43,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
create table test2(t text); create table test2(t text);
insert into test2 values ('abcdef'); insert into test2 values ('abcdef');
insert into test2 values ('quark'); insert into test2 values ('quark');
insert into test2 values (' z foo bar');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops); create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
explain (costs off) explain (costs off)
...@@ -54,6 +55,29 @@ select * from test2 where t like '%bcd%'; ...@@ -54,6 +55,29 @@ select * from test2 where t like '%bcd%';
select * from test2 where t like E'%\\bcd%'; select * from test2 where t like E'%\\bcd%';
select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%'; select * from test2 where t ilike 'qua%';
select * from test2 where t like '%z foo bar%';
select * from test2 where t like ' z foo%';
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
explain (costs off)
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ '[abc]{3}';
select * from test2 where t ~ 'a[bc]+d';
select * from test2 where t ~ '(abc)*$';
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ 'dEf';
select * from test2 where t ~* '^q';
select * from test2 where t ~* '[abc]{3}[def]{3}';
select * from test2 where t ~* 'ab[a-z]{3}';
select * from test2 where t ~* '(^| )qua';
select * from test2 where t ~ 'q.*rk$';
select * from test2 where t ~ 'q';
select * from test2 where t ~ '[a-z]{3}';
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
select * from test2 where t ~ 'z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
drop index test2_idx_gin; drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops); create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off; set enable_seqscan=off;
......
...@@ -7,18 +7,20 @@ ...@@ -7,18 +7,20 @@
#include "access/gist.h" #include "access/gist.h"
#include "access/itup.h" #include "access/itup.h"
#include "storage/bufpage.h" #include "storage/bufpage.h"
#include "utils/builtins.h"
/* options */ /*
* Options ... but note that trgm_regexp.c effectively assumes these values
* of LPADDING and RPADDING.
*/
#define LPADDING 2 #define LPADDING 2
#define RPADDING 1 #define RPADDING 1
#define KEEPONLYALNUM #define KEEPONLYALNUM
/* /*
* Caution: IGNORECASE macro means that trigrams are case-insensitive. * Caution: IGNORECASE macro means that trigrams are case-insensitive.
* If this macro is disabled, the ~~* operator must be removed from the * If this macro is disabled, the ~* and ~~* operators must be removed from
* operator classes, because we can't handle case-insensitive wildcard search * the operator classes, because we can't handle case-insensitive wildcard
* with case-sensitive trigrams. Failure to do this will result in "cannot * search with case-sensitive trigrams. Failure to do this will result in
* handle ~~* with case-sensitive trigrams" errors. * "cannot handle ~*(~~*) with case-sensitive trigrams" errors.
*/ */
#define IGNORECASE #define IGNORECASE
#define DIVUNION #define DIVUNION
...@@ -28,6 +30,8 @@ ...@@ -28,6 +30,8 @@
#define DistanceStrategyNumber 2 #define DistanceStrategyNumber 2
#define LikeStrategyNumber 3 #define LikeStrategyNumber 3
#define ILikeStrategyNumber 4 #define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
typedef char trgm[3]; typedef char trgm[3];
...@@ -42,11 +46,11 @@ typedef char trgm[3]; ...@@ -42,11 +46,11 @@ typedef char trgm[3];
*(((char*)(a))+2) = *(((char*)(b))+2); \ *(((char*)(a))+2) = *(((char*)(b))+2); \
} while(0); } while(0);
uint32 trgm2int(trgm *ptr);
#ifdef KEEPONLYALNUM #ifdef KEEPONLYALNUM
#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c))
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
#else #else
#define ISWORDCHR(c) (!t_isspace(c))
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
#endif #endif
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
...@@ -99,11 +103,18 @@ typedef char *BITVECP; ...@@ -99,11 +103,18 @@ typedef char *BITVECP;
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) ) #define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) ) #define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
typedef struct TrgmPackedGraph TrgmPackedGraph;
extern float4 trgm_limit; extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen); extern uint32 trgm2int(trgm *ptr);
TRGM *generate_wildcard_trgm(const char *str, int slen); extern void compact_trigram(trgm *tptr, char *str, int bytelen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2); extern TRGM *generate_trgm(char *str, int slen);
bool trgm_contained_by(TRGM *trg1, TRGM *trg2); extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
Oid collation);
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
#endif /* __TRGM_H__ */ #endif /* __TRGM_H__ */
...@@ -80,13 +80,15 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) ...@@ -80,13 +80,15 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
StrategyNumber strategy = PG_GETARG_UINT16(2); StrategyNumber strategy = PG_GETARG_UINT16(2);
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */ /* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */ /* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6); int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
Datum *entries = NULL; Datum *entries = NULL;
TRGM *trg; TRGM *trg;
int32 trglen; int32 trglen;
trgm *ptr; trgm *ptr;
TrgmPackedGraph *graph;
int32 i; int32 i;
switch (strategy) switch (strategy)
...@@ -107,6 +109,33 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) ...@@ -107,6 +109,33 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
*/ */
trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break; break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
trg = createTrgmNFA(val, &graph, PG_GET_COLLATION());
if (trg && ARRNELEM(trg) > 0)
{
/*
* Successful regex processing: store NFA-like graph as
* extra_data. GIN API requires an array of nentries
* Pointers, but we just put the same value in each element.
*/
trglen = ARRNELEM(trg);
*extra_data = (Pointer *) palloc(sizeof(Pointer) * trglen);
for (i = 0; i < trglen; i++)
(*extra_data)[i] = (Pointer) graph;
}
else
{
/* No result: have to do full index scan. */
*nentries = 0;
*searchMode = GIN_SEARCH_MODE_ALL;
PG_RETURN_POINTER(entries);
}
break;
default: default:
elog(ERROR, "unrecognized strategy number: %d", strategy); elog(ERROR, "unrecognized strategy number: %d", strategy);
trg = NULL; /* keep compiler quiet */ trg = NULL; /* keep compiler quiet */
...@@ -146,8 +175,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -146,8 +175,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
/* text *query = PG_GETARG_TEXT_P(2); */ /* text *query = PG_GETARG_TEXT_P(2); */
int32 nkeys = PG_GETARG_INT32(3); int32 nkeys = PG_GETARG_INT32(3);
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5); bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res; bool res;
int32 i, int32 i,
...@@ -189,6 +217,21 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -189,6 +217,21 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
} }
} }
break; break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
if (nkeys < 1)
{
/* Regex processing gave no result: do full index scan */
res = true;
}
else
res = trigramsMatchGraph((TrgmPackedGraph *) extra_data[0],
check);
break;
default: default:
elog(ERROR, "unrecognized strategy number: %d", strategy); elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */ res = false; /* keep compiler quiet */
......
...@@ -77,12 +77,6 @@ unique_array(trgm *a, int len) ...@@ -77,12 +77,6 @@ unique_array(trgm *a, int len)
return curend + 1 - a; return curend + 1 - a;
} }
#ifdef KEEPONLYALNUM
#define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
#else
#define iswordchr(c) (!t_isspace(c))
#endif
/* /*
* Finds first word in string, returns pointer to the word, * Finds first word in string, returns pointer to the word,
* endword points to the character after word * endword points to the character after word
...@@ -92,7 +86,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen) ...@@ -92,7 +86,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
{ {
char *beginword = str; char *beginword = str;
while (beginword - str < lenstr && !iswordchr(beginword)) while (beginword - str < lenstr && !ISWORDCHR(beginword))
beginword += pg_mblen(beginword); beginword += pg_mblen(beginword);
if (beginword - str >= lenstr) if (beginword - str >= lenstr)
...@@ -100,7 +94,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen) ...@@ -100,7 +94,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
*endword = beginword; *endword = beginword;
*charlen = 0; *charlen = 0;
while (*endword - str < lenstr && iswordchr(*endword)) while (*endword - str < lenstr && ISWORDCHR(*endword))
{ {
*endword += pg_mblen(*endword); *endword += pg_mblen(*endword);
(*charlen)++; (*charlen)++;
...@@ -114,7 +108,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen) ...@@ -114,7 +108,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
* which is always exactly three bytes. If we have three single-byte * which is always exactly three bytes. If we have three single-byte
* characters, we just use them as-is; otherwise we form a hash value. * characters, we just use them as-is; otherwise we form a hash value.
*/ */
static void void
compact_trigram(trgm *tptr, char *str, int bytelen) compact_trigram(trgm *tptr, char *str, int bytelen)
{ {
if (bytelen == 3) if (bytelen == 3)
...@@ -290,7 +284,7 @@ get_wildcard_part(const char *str, int lenstr, ...@@ -290,7 +284,7 @@ get_wildcard_part(const char *str, int lenstr,
{ {
if (in_escape) if (in_escape)
{ {
if (iswordchr(beginword)) if (ISWORDCHR(beginword))
break; break;
in_escape = false; in_escape = false;
in_leading_wildcard_meta = false; in_leading_wildcard_meta = false;
...@@ -301,7 +295,7 @@ get_wildcard_part(const char *str, int lenstr, ...@@ -301,7 +295,7 @@ get_wildcard_part(const char *str, int lenstr,
in_escape = true; in_escape = true;
else if (ISWILDCARDCHAR(beginword)) else if (ISWILDCARDCHAR(beginword))
in_leading_wildcard_meta = true; in_leading_wildcard_meta = true;
else if (iswordchr(beginword)) else if (ISWORDCHR(beginword))
break; break;
else else
in_leading_wildcard_meta = false; in_leading_wildcard_meta = false;
...@@ -344,7 +338,7 @@ get_wildcard_part(const char *str, int lenstr, ...@@ -344,7 +338,7 @@ get_wildcard_part(const char *str, int lenstr,
clen = pg_mblen(endword); clen = pg_mblen(endword);
if (in_escape) if (in_escape)
{ {
if (iswordchr(endword)) if (ISWORDCHR(endword))
{ {
memcpy(s, endword, clen); memcpy(s, endword, clen);
(*charlen)++; (*charlen)++;
...@@ -372,7 +366,7 @@ get_wildcard_part(const char *str, int lenstr, ...@@ -372,7 +366,7 @@ get_wildcard_part(const char *str, int lenstr,
in_trailing_wildcard_meta = true; in_trailing_wildcard_meta = true;
break; break;
} }
else if (iswordchr(endword)) else if (ISWORDCHR(endword))
{ {
memcpy(s, endword, clen); memcpy(s, endword, clen);
(*charlen)++; (*charlen)++;
......
This diff is collapsed.
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
<para> <para>
The <filename>pg_trgm</filename> module provides functions and operators The <filename>pg_trgm</filename> module provides functions and operators
for determining the similarity of <acronym>ASCII</> for determining the similarity of
alphanumeric text based on trigram matching, as alphanumeric text based on trigram matching, as
well as index operator classes that support fast searching for similar well as index operator classes that support fast searching for similar
strings. strings.
...@@ -28,7 +28,9 @@ ...@@ -28,7 +28,9 @@
<note> <note>
<para> <para>
A string is considered to have two spaces <filename>pg_trgm</filename> ignores non-word characters
(non-alphanumerics) when extracting trigrams from a string.
Each word is considered to have two spaces
prefixed and one space suffixed when determining the set prefixed and one space suffixed when determining the set
of trigrams contained in the string. of trigrams contained in the string.
For example, the set of trigrams in the string For example, the set of trigrams in the string
...@@ -37,6 +39,16 @@ ...@@ -37,6 +39,16 @@
<quote><literal> ca</literal></quote>, <quote><literal> ca</literal></quote>,
<quote><literal>cat</literal></quote>, and <quote><literal>cat</literal></quote>, and
<quote><literal>at </literal></quote>. <quote><literal>at </literal></quote>.
The set of trigrams in the string
<quote><literal>foo|bar</literal></quote> is
<quote><literal> f</literal></quote>,
<quote><literal> fo</literal></quote>,
<quote><literal>foo</literal></quote>,
<quote><literal>oo </literal></quote>,
<quote><literal> b</literal></quote>,
<quote><literal> ba</literal></quote>,
<quote><literal>bar</literal></quote>, and
<quote><literal>ar </literal></quote>.
</para> </para>
</note> </note>
</sect2> </sect2>
...@@ -145,9 +157,10 @@ ...@@ -145,9 +157,10 @@
operator classes that allow you to create an index over a text column for operator classes that allow you to create an index over a text column for
the purpose of very fast similarity searches. These index types support the purpose of very fast similarity searches. These index types support
the above-described similarity operators, and additionally support the above-described similarity operators, and additionally support
trigram-based index searches for <literal>LIKE</> and <literal>ILIKE</> trigram-based index searches for <literal>LIKE</>, <literal>ILIKE</>,
queries. (These indexes do not support equality nor simple comparison <literal>~</> and <literal>~*</> queries. (These indexes do not
operators, so you may need a regular B-tree index too.) support equality nor simple comparison operators, so you may need a
regular B-tree index too.)
</para> </para>
<para> <para>
...@@ -202,6 +215,26 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar'; ...@@ -202,6 +215,26 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
searches, the search string need not be left-anchored. searches, the search string need not be left-anchored.
</para> </para>
<para>
Beginning in <productname>PostgreSQL</> 9.3, <filename>pg_trgm</filename>
GIN indexes also support index searches for regular-expression matches
(<literal>~</> and <literal>~*</> operators), for example
<programlisting>
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
</programlisting>
The index search works by extracting trigrams from the regular expression
and then looking these up in the index. The more trigrams that can be
extracted from the regular expression, the more effective the index search
is. Unlike B-tree based searches, the search string need not be
left-anchored.
</para>
<para>
For both <literal>LIKE</> and regular-expression searches, keep in mind
that a pattern with no extractable trigrams will degenerate to a full-index
scan.
</para>
<para> <para>
The choice between GiST and GIN indexing depends on the relative The choice between GiST and GIN indexing depends on the relative
performance characteristics of GiST and GIN, which are discussed elsewhere. performance characteristics of GiST and GIN, which are discussed elsewhere.
......
...@@ -12,7 +12,7 @@ subdir = src/backend/regex ...@@ -12,7 +12,7 @@ subdir = src/backend/regex
top_builddir = ../../.. top_builddir = ../../..
include $(top_builddir)/src/Makefile.global include $(top_builddir)/src/Makefile.global
OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o regexport.o
include $(top_srcdir)/src/backend/common.mk include $(top_srcdir)/src/backend/common.mk
......
...@@ -7,8 +7,8 @@ So this file is an attempt to reverse-engineer some docs. ...@@ -7,8 +7,8 @@ So this file is an attempt to reverse-engineer some docs.
General source-file layout General source-file layout
-------------------------- --------------------------
There are five separately-compilable source files, each exposing exactly There are six separately-compilable source files, five of which expose
one exported function: exactly one exported function apiece:
regcomp.c: pg_regcomp regcomp.c: pg_regcomp
regexec.c: pg_regexec regexec.c: pg_regexec
regerror.c: pg_regerror regerror.c: pg_regerror
...@@ -19,6 +19,9 @@ library version from any similar one that might be present on a particular ...@@ -19,6 +19,9 @@ library version from any similar one that might be present on a particular
system. They'd need to be removed or replaced in any standalone version system. They'd need to be removed or replaced in any standalone version
of the library.) of the library.)
The sixth file, regexport.c, exposes multiple functions that allow extraction
of info about a compiled regex (see regexport.h).
There are additional source files regc_*.c that are #include'd in regcomp, There are additional source files regc_*.c that are #include'd in regcomp,
and similarly additional source files rege_*.c that are #include'd in and similarly additional source files rege_*.c that are #include'd in
regexec. This was done to avoid exposing internal symbols globally; regexec. This was done to avoid exposing internal symbols globally;
...@@ -45,6 +48,7 @@ regexec.c Top-level regex execution code ...@@ -45,6 +48,7 @@ regexec.c Top-level regex execution code
rege_dfa.c DFA creation and execution rege_dfa.c DFA creation and execution
regerror.c pg_regerror: generate text for a regex error code regerror.c pg_regerror: generate text for a regex error code
regfree.c pg_regfree: API to free a no-longer-needed regex_t regfree.c pg_regfree: API to free a no-longer-needed regex_t
regexport.c Functions for extracting info from a regex_t
regprefix.c Code for extracting a common prefix from a regex_t regprefix.c Code for extracting a common prefix from a regex_t
The locale-specific code is concerned primarily with case-folding and with The locale-specific code is concerned primarily with case-folding and with
...@@ -56,6 +60,7 @@ The header files for the library are in src/include/regex/: ...@@ -56,6 +60,7 @@ The header files for the library are in src/include/regex/:
regcustom.h Customizes library for particular application regcustom.h Customizes library for particular application
regerrs.h Error message list regerrs.h Error message list
regex.h Exported API regex.h Exported API
regexport.h Exported API for regexport.c
regguts.h Internals declarations regguts.h Internals declarations
......
/*-------------------------------------------------------------------------
*
* regexport.c
* Functions for exporting info about a regex's NFA
*
* In this implementation, the NFA defines a necessary but not sufficient
* condition for a string to match the regex: that is, there can be strings
* that match the NFA but don't match the full regex, but not vice versa.
* Thus, for example, it is okay for the functions below to ignore lookahead
* constraints, which merely constrain the string some more.
*
* Notice that these functions return info into caller-provided arrays
* rather than doing their own malloc's. This simplifies the APIs by
* eliminating a class of error conditions, and in the case of colors
* allows the caller to decide how big is too big to bother with.
*
*
* Portions Copyright (c) 2013, PostgreSQL Global Development Group
* Portions Copyright (c) 1998, 1999 Henry Spencer
*
* IDENTIFICATION
* src/backend/regex/regexport.c
*
*-------------------------------------------------------------------------
*/
#include "regex/regguts.h"
#include "regex/regexport.h"
/* Forward declaration of the recursive colormap-tree scanner defined below. */
static void scancolormap(struct colormap * cm, int co,
union tree * t, int level, chr partial,
pg_wchar **chars, int *chars_len);
/*
* Get total number of NFA states.
*/
int
pg_reg_getnumstates(const regex_t *regex)
{
struct cnfa *cnfa;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
return cnfa->nstates;
}
/*
* Get initial state of NFA.
*/
int
pg_reg_getinitialstate(const regex_t *regex)
{
struct cnfa *cnfa;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
return cnfa->pre;
}
/*
* Get final state of NFA.
*/
int
pg_reg_getfinalstate(const regex_t *regex)
{
struct cnfa *cnfa;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
return cnfa->post;
}
/*
* Get number of outgoing NFA arcs of state number "st".
*
* Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
*/
int
pg_reg_getnumoutarcs(const regex_t *regex, int st)
{
struct cnfa *cnfa;
struct carc *ca;
int count;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
if (st < 0 || st >= cnfa->nstates)
return 0;
count = 0;
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
if (ca->co < cnfa->ncolors)
count++;
}
return count;
}
/*
 * pg_reg_getoutarcs
 *		Copy the outgoing arcs of NFA state number "st" into arcs[].
 *
 * arcs_len is the capacity of arcs[]; it must be at least the value reported
 * by pg_reg_getnumoutarcs(), else not all arcs will be returned.  Nothing is
 * written for an out-of-range state number or a non-positive arcs_len.
 * Arcs whose color is not below the NFA's ncolors are skipped, matching the
 * counting rule used by pg_reg_getnumoutarcs().
 */
void
pg_reg_getoutarcs(const regex_t *regex, int st,
				  regex_arc_t *arcs, int arcs_len)
{
	const struct cnfa *nfa;
	const struct carc *arc;
	int			nout = 0;

	assert(regex != NULL && regex->re_magic == REMAGIC);
	nfa = &((struct guts *) regex->re_guts)->search;

	if (st < 0 || st >= nfa->nstates || arcs_len <= 0)
		return;

	/* Stop at the list terminator or as soon as the output array is full */
	for (arc = nfa->states[st];
		 nout < arcs_len && arc->co != COLORLESS;
		 arc++)
	{
		if (arc->co >= nfa->ncolors)
			continue;			/* not a reportable arc */

		arcs[nout].co = arc->co;
		arcs[nout].to = arc->to;
		nout++;
	}
}
/*
* Get total number of colors.
*/
int
pg_reg_getnumcolors(const regex_t *regex)
{
struct colormap *cm;
assert(regex != NULL && regex->re_magic == REMAGIC);
cm = &((struct guts *) regex->re_guts)->cmap;
return cm->max + 1;
}
/*
* Check if color is beginning of line/string.
*
* (We might at some point need to offer more refined handling of pseudocolors,
* but this will do for now.)
*/
int
pg_reg_colorisbegin(const regex_t *regex, int co)
{
struct cnfa *cnfa;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
if (co == cnfa->bos[0] || co == cnfa->bos[1])
return true;
else
return false;
}
/*
* Check if color is end of line/string.
*/
int
pg_reg_colorisend(const regex_t *regex, int co)
{
struct cnfa *cnfa;
assert(regex != NULL && regex->re_magic == REMAGIC);
cnfa = &((struct guts *) regex->re_guts)->search;
if (co == cnfa->eos[0] || co == cnfa->eos[1])
return true;
else
return false;
}
/*
 * Report how many chrs are members of color number "co".
 *
 * The result is -1 when the color number is out of range, when it names a
 * special color (WHITE, i.e. color 0, or a pseudocolor), or when the member
 * count is not known for certain.  That last possibility does not occur at
 * present; it is reserved for future improvements (see the discussion of
 * run-time handling of higher character codes in regex/README).  A caller
 * that gets -1 must not attempt to fetch the color's members.
 */
int
pg_reg_getnumcharacters(const regex_t *regex, int co)
{
	struct colormap *colors;

	assert(regex != NULL && regex->re_magic == REMAGIC);
	colors = &((struct guts *) regex->re_guts)->cmap;

	/*
	 * Reject WHITE, out-of-range numbers, and pseudocolors (BOS etc).  The
	 * range tests come first so that cd[] is never indexed with a bad value.
	 */
	if (co <= 0 || co > colors->max || (colors->cd[co].flags & PSEUDO))
		return -1;

	return colors->cd[co].nchrs;
}
/*
 * Store the member chrs of color number "co" into chars[].
 *
 * chars_len is the allocated length of chars[].  It should be no less than
 * the count reported by pg_reg_getnumcharacters(); if it is smaller, only
 * some of the members are returned.
 *
 * WHITE and pseudocolors are not supported: nothing is stored for them, nor
 * for an out-of-range color number or a nonpositive chars_len.
 *
 * Caution: this is a relatively expensive operation.
 */
void
pg_reg_getcharacters(const regex_t *regex, int co,
					 pg_wchar *chars, int chars_len)
{
	struct colormap *colors;

	assert(regex != NULL && regex->re_magic == REMAGIC);
	colors = &((struct guts *) regex->re_guts)->cmap;

	/*
	 * Walk the colormap tree from its root, but only for an ordinary color
	 * and a nonempty output array.  The range tests precede the cd[] lookup
	 * so that it is never indexed with a bad value.
	 */
	if (co > 0 && co <= colors->max && chars_len > 0 &&
		!(colors->cd[co].flags & PSEUDO))
		scancolormap(colors, co, colors->tree, 0, 0, &chars, &chars_len);
}
/*
 * Recursively scan the colormap tree to find chrs belonging to color "co".
 * See regex/README for info about the tree structure.
 *
 * cm: colormap being searched
 * co: color number whose member chrs are wanted
 * t: tree block to scan
 * level: level (from 0) of t
 * partial: partial chr code for chrs within t
 * chars, chars_len: output area; *chars is advanced and *chars_len is
 *		decremented for each chr stored
 *
 * At most *chars_len chrs are stored.  The scan is abandoned as soon as the
 * output area is full, since nothing more could be stored and walking the
 * remainder of the tree would be wasted effort.
 */
static void
scancolormap(struct colormap * cm, int co,
			 union tree * t, int level, chr partial,
			 pg_wchar **chars, int *chars_len)
{
	int			i;

	/* Nothing can be stored once the output area is exhausted */
	if (*chars_len <= 0)
		return;

	if (level < NBYTS - 1)
	{
		/* non-leaf node */
		for (i = 0; i < BYTTAB; i++)
		{
			/*
			 * We do not support search for chrs of color 0 (WHITE), so
			 * all-white subtrees need not be searched.  These can be
			 * recognized because they are represented by the fill blocks in
			 * the colormap struct.  This typically allows us to avoid
			 * scanning large regions of higher-numbered chrs.
			 */
			if (t->tptr[i] == &cm->tree[level + 1])
				continue;

			/* Recursively scan next level down */
			scancolormap(cm, co,
						 t->tptr[i], level + 1,
						 (partial | (chr) i) << BYTBITS,
						 chars, chars_len);

			/* Quit as soon as that subtree has filled the output area */
			if (*chars_len <= 0)
				return;
		}
	}
	else
	{
		/* leaf node */
		for (i = 0; i < BYTTAB; i++)
		{
			if (t->tcolor[i] != co)
				continue;

			/* Room is guaranteed here: we return the moment it runs out */
			**chars = partial | (chr) i;
			(*chars)++;
			if (--(*chars_len) <= 0)
				return;
		}
	}
}
...@@ -6746,6 +6746,7 @@ gincost_pattern(IndexOptInfo *index, int indexcol, ...@@ -6746,6 +6746,7 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
GinQualCounts *counts) GinQualCounts *counts)
{ {
Oid extractProcOid; Oid extractProcOid;
Oid collation;
int strategy_op; int strategy_op;
Oid lefttype, Oid lefttype,
righttype; righttype;
...@@ -6783,7 +6784,16 @@ gincost_pattern(IndexOptInfo *index, int indexcol, ...@@ -6783,7 +6784,16 @@ gincost_pattern(IndexOptInfo *index, int indexcol,
get_rel_name(index->indexoid)); get_rel_name(index->indexoid));
} }
OidFunctionCall7(extractProcOid, /*
* Choose collation to pass to extractProc (should match initGinState).
*/
if (OidIsValid(index->indexcollations[indexcol]))
collation = index->indexcollations[indexcol];
else
collation = DEFAULT_COLLATION_OID;
OidFunctionCall7Coll(extractProcOid,
collation,
query, query,
PointerGetDatum(&nentries), PointerGetDatum(&nentries),
UInt16GetDatum(strategy_op), UInt16GetDatum(strategy_op),
......
...@@ -24,6 +24,11 @@ ...@@ -24,6 +24,11 @@
*/ */
typedef unsigned int pg_wchar; typedef unsigned int pg_wchar;
/*
* Maximum byte length of multibyte characters in any backend encoding
*/
#define MAX_MULTIBYTE_CHAR_LEN 4
/* /*
* various definitions for EUC * various definitions for EUC
*/ */
......
/*-------------------------------------------------------------------------
*
* regexport.h
* Declarations for exporting info about a regex's NFA (nondeterministic
* finite automaton)
*
* The functions declared here provide accessors to extract the NFA state
* graph and color character sets of a successfully-compiled regex.
*
* An NFA contains one or more states, numbered 0..N-1. There is an initial
* state, as well as a final state --- reaching the final state denotes
* successful matching of an input string. Each state except the final one
* has some out-arcs that lead to successor states, each arc being labeled
* with a color that represents one or more concrete character codes.
* (The colors of a state's out-arcs need not be distinct, since this is an
* NFA not a DFA.) There are also "pseudocolors" representing start/end of
* line and start/end of string. Colors are numbered 0..C-1, but note that
* color 0 is "white" (all unused characters) and can generally be ignored.
*
* Portions Copyright (c) 2013, PostgreSQL Global Development Group
* Portions Copyright (c) 1998, 1999 Henry Spencer
*
* IDENTIFICATION
* src/include/regex/regexport.h
*
*-------------------------------------------------------------------------
*/
#ifndef _REGEXPORT_H_
#define _REGEXPORT_H_
#include "regex/regex.h"
/*
 * Information about one arc of a regex's NFA.
 *
 * pg_reg_getoutarcs() fills an array of these, one entry per out-arc of the
 * requested state.
 */
typedef struct
{
int co; /* label of arc: the color (character set) it is labeled with */
int to; /* number of the state the arc leads to */
} regex_arc_t;
/*
 * Functions for gathering information about NFA states and arcs.
 *
 * State numbers run from 0 to N-1.  The first three functions report the
 * state count and the numbers of the initial and final states (their
 * definitions are not part of this header; see them for exact semantics).
 * pg_reg_getnumoutarcs() gives the array length that pg_reg_getoutarcs()
 * needs in order to return every out-arc of state "st"; with a shorter
 * array, not all arcs are returned.
 */
extern int pg_reg_getnumstates(const regex_t *regex);
extern int pg_reg_getinitialstate(const regex_t *regex);
extern int pg_reg_getfinalstate(const regex_t *regex);
extern int pg_reg_getnumoutarcs(const regex_t *regex, int st);
extern void pg_reg_getoutarcs(const regex_t *regex, int st,
regex_arc_t *arcs, int arcs_len);
/*
 * Functions for gathering information about colors.
 *
 * pg_reg_colorisbegin() and pg_reg_colorisend() test whether "co" is a
 * start-of-line/string or end-of-line/string color, respectively.
 * pg_reg_getnumcharacters() returns -1 for an invalid color number and for
 * special colors (WHITE or a pseudocolor); in that case the members must not
 * be requested.  pg_reg_getcharacters() stores at most chars_len member chrs
 * into chars[].
 */
extern int pg_reg_getnumcolors(const regex_t *regex);
extern int pg_reg_colorisbegin(const regex_t *regex, int co);
extern int pg_reg_colorisend(const regex_t *regex, int co);
extern int pg_reg_getnumcharacters(const regex_t *regex, int co);
extern void pg_reg_getcharacters(const regex_t *regex, int co,
pg_wchar *chars, int chars_len);
#endif /* _REGEXPORT_H_ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment