Commit 6f5b8beb authored by Tom Lane

Make contrib/pg_trgm also support regex searches with GiST indexes.

This wasn't addressed in the original patch, but it doesn't take very
much additional code to cover the case, so let's get it done.

Since pg_trgm 1.1 hasn't been released yet, I just changed the definition
of what's in it, rather than inventing a 1.2.
parent e543631f
...@@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%'; ...@@ -3706,3 +3706,135 @@ select * from test2 where t ilike 'qua%';
quark quark
(1 row) (1 row)
select * from test2 where t like '%z foo bar%';
t
-------------
z foo bar
(1 row)
select * from test2 where t like ' z foo%';
t
-------------
z foo bar
(1 row)
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~ '[abc]{3}'::text)
(2 rows)
explain (costs off)
select * from test2 where t ~* 'DEF';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~* 'DEF'::text)
(2 rows)
select * from test2 where t ~ '[abc]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'a[bc]+d';
t
--------
abcdef
(1 row)
select * from test2 where t ~ '(abc)*$';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* 'DEF';
t
--------
abcdef
(1 row)
select * from test2 where t ~ 'dEf';
t
---
(0 rows)
select * from test2 where t ~* '^q';
t
-------
quark
(1 row)
select * from test2 where t ~* '[abc]{3}[def]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* 'ab[a-z]{3}';
t
--------
abcdef
(1 row)
select * from test2 where t ~* '(^| )qua';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q.*rk$';
t
-------
quark
(1 row)
select * from test2 where t ~ 'q';
t
-------
quark
(1 row)
select * from test2 where t ~ '[a-z]{3}';
t
-------------
abcdef
quark
z foo bar
(3 rows)
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
t
---
(0 rows)
select * from test2 where t ~ 'z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo bar';
t
-------------
z foo bar
(1 row)
select * from test2 where t ~ ' z foo';
t
-------------
z foo bar
(1 row)
...@@ -3,6 +3,10 @@ ...@@ -3,6 +3,10 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION -- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
-- Register the regex-match operators with the GiST trigram operator family:
-- ~ (text, text) as operator number 5 and ~* (text, text) as operator number 6.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 5 pg_catalog.~ (text, text), OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text); OPERATOR 6 pg_catalog.~* (text, text);
...@@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD ...@@ -132,6 +132,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 4 pg_catalog.~~* (text, text), OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid); FUNCTION 8 (text, text) gtrgm_distance (internal, text, int, oid);
-- Add operators that are new in 9.3: the regex-match operators
-- ~ (operator number 5) and ~* (operator number 6) on (text, text).
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
-- support functions for gin -- support functions for gin
CREATE FUNCTION gin_extract_value_trgm(text, internal) CREATE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal RETURNS internal
......
...@@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%'; ...@@ -90,3 +90,26 @@ select * from test2 where t like '%bcd%';
select * from test2 where t like E'%\\bcd%'; select * from test2 where t like E'%\\bcd%';
select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%'; select * from test2 where t ilike 'qua%';
select * from test2 where t like '%z foo bar%';
select * from test2 where t like ' z foo%';
explain (costs off)
select * from test2 where t ~ '[abc]{3}';
explain (costs off)
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ '[abc]{3}';
select * from test2 where t ~ 'a[bc]+d';
select * from test2 where t ~ '(abc)*$';
select * from test2 where t ~* 'DEF';
select * from test2 where t ~ 'dEf';
select * from test2 where t ~* '^q';
select * from test2 where t ~* '[abc]{3}[def]{3}';
select * from test2 where t ~* 'ab[a-z]{3}';
select * from test2 where t ~* '(^| )qua';
select * from test2 where t ~ 'q.*rk$';
select * from test2 where t ~ 'q';
select * from test2 where t ~ '[a-z]{3}';
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
select * from test2 where t ~ 'z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo bar';
select * from test2 where t ~ ' z foo';
...@@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen); ...@@ -113,8 +113,9 @@ extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen); extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2); extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2); extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph, extern bool *trgm_presence_map(TRGM *query, TRGM *key);
Oid collation); extern TRGM *createTrgmNFA(text *text_re, Oid collation,
TrgmPackedGraph **graph, MemoryContext rcontext);
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check); extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
#endif /* __TRGM_H__ */ #endif /* __TRGM_H__ */
...@@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) ...@@ -115,7 +115,8 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
#endif #endif
/* FALL THRU */ /* FALL THRU */
case RegExpStrategyNumber: case RegExpStrategyNumber:
trg = createTrgmNFA(val, &graph, PG_GET_COLLATION()); trg = createTrgmNFA(val, PG_GET_COLLATION(),
&graph, CurrentMemoryContext);
if (trg && ARRNELEM(trg) > 0) if (trg && ARRNELEM(trg) > 0)
{ {
/* /*
......
...@@ -8,6 +8,25 @@ ...@@ -8,6 +8,25 @@
#include "access/skey.h" #include "access/skey.h"
/*
 * Cache kept by gtrgm_consistent in fcinfo->flinfo->fn_extra.
 *
 * It remembers the last (strategy, query) pair seen together with the
 * trigrams extracted from that query, so that extraction is redone only
 * when the strategy or the query text changes.
 */
typedef struct
{
/* most recent inputs to gtrgm_consistent */
StrategyNumber strategy;
text *query;
/* extracted trigrams for query; NULL when no trigrams were extracted */
TRGM *trigrams;
/* if a regex operator, the extracted graph; NULL for other strategies */
TrgmPackedGraph *graph;
/*
 * The "query" and "trigrams" are stored in the same palloc block as this
 * cache struct, at MAXALIGN'ed offsets. The graph however isn't: it is
 * a separate allocation that freeing the cache block does not release.
 */
} gtrgm_consistent_cache;
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
PG_FUNCTION_INFO_V1(gtrgm_in); PG_FUNCTION_INFO_V1(gtrgm_in);
Datum gtrgm_in(PG_FUNCTION_ARGS); Datum gtrgm_in(PG_FUNCTION_ARGS);
...@@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS); ...@@ -38,8 +57,6 @@ Datum gtrgm_penalty(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gtrgm_picksplit); PG_FUNCTION_INFO_V1(gtrgm_picksplit);
Datum gtrgm_picksplit(PG_FUNCTION_ARGS); Datum gtrgm_picksplit(PG_FUNCTION_ARGS);
#define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
/* Number of one-bits in an unsigned byte */ /* Number of one-bits in an unsigned byte */
static const uint8 number_of_ones[256] = { static const uint8 number_of_ones[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
...@@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -191,24 +208,30 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *qtrg; TRGM *qtrg;
bool res; bool res;
Size querysize = VARSIZE(query); Size querysize = VARSIZE(query);
char *cache = (char *) fcinfo->flinfo->fn_extra, gtrgm_consistent_cache *cache;
*cachedQuery = cache + MAXALIGN(sizeof(StrategyNumber));
/* /*
* Store both the strategy number and extracted trigrams in cache, because * We keep the extracted trigrams in cache, because trigram extraction is
* trigram extraction is relatively CPU-expensive. We must include * relatively CPU-expensive. When trying to reuse a cached value, check
* strategy number because trigram extraction depends on strategy. * strategy number not just query itself, because trigram extraction
* depends on strategy.
* *
* The cached structure contains the strategy number, then the input query * The cached structure is a single palloc chunk containing the
* (starting at a MAXALIGN boundary), then the TRGM value (also starting * gtrgm_consistent_cache header, then the input query (starting at a
* at a MAXALIGN boundary). * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN
* boundary). However we don't try to include the regex graph (if any) in
* that struct. (XXX currently, this approach can leak regex graphs
* across index rescans. Not clear if that's worth fixing.)
*/ */
cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
if (cache == NULL || if (cache == NULL ||
strategy != *((StrategyNumber *) cache) || cache->strategy != strategy ||
VARSIZE(cachedQuery) != querysize || VARSIZE(cache->query) != querysize ||
memcmp(cachedQuery, query, querysize) != 0) memcmp((char *) cache->query, (char *) query, querysize) != 0)
{ {
char *newcache; gtrgm_consistent_cache *newcache;
TrgmPackedGraph *graph = NULL;
Size qtrgsize;
switch (strategy) switch (strategy)
{ {
...@@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -225,28 +248,58 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
qtrg = generate_wildcard_trgm(VARDATA(query), qtrg = generate_wildcard_trgm(VARDATA(query),
querysize - VARHDRSZ); querysize - VARHDRSZ);
break; break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
&graph, fcinfo->flinfo->fn_mcxt);
/* just in case an empty array is returned ... */
if (qtrg && ARRNELEM(qtrg) <= 0)
{
pfree(qtrg);
qtrg = NULL;
}
break;
default: default:
elog(ERROR, "unrecognized strategy number: %d", strategy); elog(ERROR, "unrecognized strategy number: %d", strategy);
qtrg = NULL; /* keep compiler quiet */ qtrg = NULL; /* keep compiler quiet */
break; break;
} }
newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
MAXALIGN(sizeof(StrategyNumber)) +
MAXALIGN(querysize) +
VARSIZE(qtrg));
cachedQuery = newcache + MAXALIGN(sizeof(StrategyNumber));
*((StrategyNumber *) newcache) = strategy; newcache = (gtrgm_consistent_cache *)
memcpy(cachedQuery, query, querysize); MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
memcpy(cachedQuery + MAXALIGN(querysize), qtrg, VARSIZE(qtrg)); MAXALIGN(sizeof(gtrgm_consistent_cache)) +
MAXALIGN(querysize) +
qtrgsize);
newcache->strategy = strategy;
newcache->query = (text *)
((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
memcpy((char *) newcache->query, (char *) query, querysize);
if (qtrg)
{
newcache->trigrams = (TRGM *)
((char *) newcache->query + MAXALIGN(querysize));
memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
/* release qtrg in case it was made in fn_mcxt */
pfree(qtrg);
}
else
newcache->trigrams = NULL;
newcache->graph = graph;
if (cache) if (cache)
pfree(cache); pfree(cache);
fcinfo->flinfo->fn_extra = newcache; fcinfo->flinfo->fn_extra = (void *) newcache;
cache = newcache;
} }
qtrg = (TRGM *) (cachedQuery + MAXALIGN(querysize)); qtrg = cache->trigrams;
switch (strategy) switch (strategy)
{ {
...@@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -317,6 +370,57 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
} }
} }
break; break;
case RegExpICaseStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case RegExpStrategyNumber:
/* Regexp search is inexact */
*recheck = true;
/* Check regex match as much as we can with available info */
if (qtrg)
{
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
bool *check;
check = trgm_presence_map(qtrg, key);
res = trigramsMatchGraph(cache->graph, check);
pfree(check);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
res = true;
}
else
{ /* non-leaf contains signature */
int32 k,
tmp = 0,
len = ARRNELEM(qtrg);
trgm *ptr = GETARR(qtrg);
BITVECP sign = GETSIGN(key);
/* descend only if at least one trigram is present */
res = false;
for (k = 0; k < len; k++)
{
CPTRGM(((char *) &tmp), ptr + k);
if (GETBIT(sign, HASHVAL(tmp)))
{
res = true;
break;
}
}
}
}
else
{
/* trigram-free query must be rechecked everywhere */
res = true;
}
break;
default: default:
elog(ERROR, "unrecognized strategy number: %d", strategy); elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */ res = false; /* keep compiler quiet */
......
...@@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2) ...@@ -616,6 +616,50 @@ trgm_contained_by(TRGM *trg1, TRGM *trg2)
return true; return true;
} }
/*
 * Build a presence map for the trigrams of "query" relative to "key".
 *
 * The result is a palloc'd array of ARRNELEM(query) booleans; entry i is
 * true exactly when the i'th trigram of "query" also occurs in "key".
 * Lookups are done by binary search over "key", so "key" must be sorted;
 * "query" may be in any order.
 */
bool *
trgm_presence_map(TRGM *query, TRGM *key)
{
	trgm	   *qtrgms = GETARR(query);
	trgm	   *ktrgms = GETARR(key);
	int			nquery = ARRNELEM(query);
	int			nkey = ARRNELEM(key);
	bool	   *present;
	int			qi;

	/* zero-filled, so every slot reads "absent" until a match is found */
	present = (bool *) palloc0(nquery * sizeof(bool));

	for (qi = 0; qi < nquery; qi++)
	{
		trgm	   *target = qtrgms + qi;
		int			low = 0;
		int			high = nkey;

		/* binary search of the key array over the half-open range [low, high) */
		while (low < high)
		{
			int			probe = (low + high) / 2;
			int			cmp = CMPTRGM(target, ktrgms + probe);

			if (cmp == 0)
			{
				present[qi] = true;
				break;
			}

			if (cmp < 0)
				high = probe;
			else
				low = probe + 1;
		}
	}

	return present;
}
Datum Datum
similarity(PG_FUNCTION_ARGS) similarity(PG_FUNCTION_ARGS)
{ {
......
...@@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams); ...@@ -476,10 +476,13 @@ static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
* *
* Returns an array of trigrams required by the regular expression, or NULL if * Returns an array of trigrams required by the regular expression, or NULL if
* the regular expression was too complex to analyze. In addition, a packed * the regular expression was too complex to analyze. In addition, a packed
* graph representation of the regex is returned into *graph. * graph representation of the regex is returned into *graph. The results
* must be allocated in rcontext (which might or might not be the current
* context).
*/ */
TRGM * TRGM *
createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) createTrgmNFA(text *text_re, Oid collation,
TrgmPackedGraph **graph, MemoryContext rcontext)
{ {
TRGM *trg; TRGM *trg;
regex_t regex; regex_t regex;
...@@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) ...@@ -488,10 +491,9 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
/* /*
* This processing generates a great deal of cruft, which we'd like to * This processing generates a great deal of cruft, which we'd like to
* clean up before returning (since this function is normally called in a * clean up before returning (since this function may be called in a
* query-lifespan memory context). Make a temp context we can work in so * query-lifespan memory context). Make a temp context we can work in so
* that cleanup is easy. Note that the returned data structures must be * that cleanup is easy.
* allocated in caller's context, however.
*/ */
tmpcontext = AllocSetContextCreate(CurrentMemoryContext, tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
"createTrgmNFA temporary context", "createTrgmNFA temporary context",
...@@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation) ...@@ -516,7 +518,7 @@ createTrgmNFA(text *text_re, TrgmPackedGraph **graph, Oid collation)
*/ */
PG_TRY(); PG_TRY();
{ {
trg = createTrgmNFAInternal(&regex, graph, oldcontext); trg = createTrgmNFAInternal(&regex, graph, rcontext);
} }
PG_CATCH(); PG_CATCH();
{ {
......
...@@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar'; ...@@ -216,8 +216,8 @@ SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
</para> </para>
<para> <para>
Beginning in <productname>PostgreSQL</> 9.3, <filename>pg_trgm</filename> Beginning in <productname>PostgreSQL</> 9.3, these index types also support
GIN indexes also support index searches for regular-expression matches index searches for regular-expression matches
(<literal>~</> and <literal>~*</> operators), for example (<literal>~</> and <literal>~*</> operators), for example
<programlisting> <programlisting>
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)'; SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment