Commit 6e2f3ae8 authored by Tom Lane's avatar Tom Lane

Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.

Unlike Btree-based LIKE optimization, this works for non-left-anchored
search patterns.  The effectiveness of the search depends on how many
trigrams can be extracted from the pattern.  (The worst case, with no
trigrams, degrades to a full-table scan, so this isn't a panacea.  But
it can be very useful.)

Alexander Korotkov, reviewed by Jan Urbanski
parent 6238473a
......@@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
create table test2(t text);
insert into test2 values ('abcdef');
insert into test2 values ('quark');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
QUERY PLAN
------------------------------------------
Bitmap Heap Scan on test2
Recheck Cond: (t ~~ '%BCD%'::text)
-> Bitmap Index Scan on test2_idx_gin
Index Cond: (t ~~ '%BCD%'::text)
(4 rows)
explain (costs off)
select * from test2 where t ilike '%BCD%';
QUERY PLAN
-------------------------------------------
Bitmap Heap Scan on test2
Recheck Cond: (t ~~* '%BCD%'::text)
-> Bitmap Index Scan on test2_idx_gin
Index Cond: (t ~~* '%BCD%'::text)
(4 rows)
select * from test2 where t like '%BCD%';
t
---
(0 rows)
select * from test2 where t like '%bcd%';
t
--------
abcdef
(1 row)
select * from test2 where t ilike '%BCD%';
t
--------
abcdef
(1 row)
select * from test2 where t ilike 'qua%';
t
-------
quark
(1 row)
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~~ '%BCD%'::text)
(2 rows)
explain (costs off)
select * from test2 where t ilike '%BCD%';
QUERY PLAN
------------------------------------------
Index Scan using test2_idx_gist on test2
Index Cond: (t ~~* '%BCD%'::text)
(2 rows)
select * from test2 where t like '%BCD%';
t
---
(0 rows)
select * from test2 where t like '%bcd%';
t
--------
abcdef
(1 row)
select * from test2 where t ilike '%BCD%';
t
--------
abcdef
(1 row)
select * from test2 where t ilike 'qua%';
t
-------
quark
(1 row)
......@@ -113,6 +113,8 @@ FOR TYPE text USING gist
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
OPERATOR 3 pg_catalog.~~ (text, text),
OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
......@@ -124,17 +126,17 @@ AS
STORAGE gtrgm;
-- support functions for gin
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal)
CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
......@@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
OPERATOR 3 pg_catalog.~~ (text, text),
OPERATOR 4 pg_catalog.~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
FUNCTION 2 gin_extract_value_trgm (text, internal),
FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
STORAGE int4;
......@@ -45,3 +45,28 @@ set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
create table test2(t text);
insert into test2 values ('abcdef');
insert into test2 values ('quark');
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
explain (costs off)
select * from test2 where t ilike '%BCD%';
select * from test2 where t like '%BCD%';
select * from test2 where t like '%bcd%';
select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%';
drop index test2_idx_gin;
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
explain (costs off)
select * from test2 where t like '%BCD%';
explain (costs off)
select * from test2 where t ilike '%BCD%';
select * from test2 where t like '%BCD%';
select * from test2 where t like '%bcd%';
select * from test2 where t ilike '%BCD%';
select * from test2 where t ilike 'qua%';
......@@ -13,12 +13,21 @@
#define LPADDING 2
#define RPADDING 1
#define KEEPONLYALNUM
/*
* Caution: IGNORECASE macro means that trigrams are case-insensitive.
* If this macro is disabled, the ~~* operator must be removed from the
* operator classes, because we can't handle case-insensitive wildcard search
* with case-sensitive trigrams. Failure to do this will result in "cannot
* handle ~~* with case-sensitive trigrams" errors.
*/
#define IGNORECASE
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
typedef char trgm[3];
......@@ -40,7 +49,10 @@ uint32 trgm2int(trgm *ptr);
#else
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
#endif
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */
typedef struct
{
......@@ -65,7 +77,7 @@ typedef char *BITVECP;
for(i=0;i<SIGLEN;i++)
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
#define GETBITBYTE(x,i) ( ((char)(x)) >> i & 0x01 )
#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITBYTE ) )
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )
......@@ -89,6 +101,8 @@ typedef char *BITVECP;
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
TRGM *generate_wildcard_trgm(const char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
#endif /* __TRGM_H__ */
......@@ -7,6 +7,7 @@
#include "access/gin.h"
#include "access/itup.h"
#include "access/skey.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
#include "utils/array.h"
......@@ -16,14 +17,31 @@
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
Datum gin_extract_value_trgm(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
/*
* This function is used as both extractValue and extractQuery
* This function can only be called if a pre-9.1 version of the GIN operator
* class definition is present in the catalogs (probably as a consequence
* of upgrade-in-place). Complain.
*/
Datum
gin_extract_trgm(PG_FUNCTION_ARGS)
{
ereport(ERROR,
(errmsg("GIN operator class for pg_trgm is out of date"),
errhint("Please drop and re-create the pg_trgm catalog entries.")));
PG_RETURN_NULL();
}
Datum
gin_extract_value_trgm(PG_FUNCTION_ARGS)
{
text *val = (text *) PG_GETARG_TEXT_P(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
......@@ -57,34 +75,124 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(entries);
}
Datum
gin_extract_query_trgm(PG_FUNCTION_ARGS)
{
text *val = (text *) PG_GETARG_TEXT_P(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
StrategyNumber strategy = PG_GETARG_UINT16(2);
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
Datum *entries = NULL;
TRGM *trg;
int32 trglen;
trgm *ptr;
int32 i;
switch (strategy)
{
case SimilarityStrategyNumber:
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case LikeStrategyNumber:
/*
* For wildcard search we extract all the trigrams that every
* potentially-matching string must include.
*/
trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
trg = NULL; /* keep compiler quiet */
break;
}
trglen = ARRNELEM(trg);
*nentries = trglen;
if (trglen > 0)
{
entries = (Datum *) palloc(sizeof(Datum) * trglen);
ptr = GETARR(trg);
for (i = 0; i < trglen; i++)
{
int32 item = trgm2int(ptr);
entries[i] = Int32GetDatum(item);
ptr++;
}
}
/*
* If no trigram was extracted then we have to scan all the index.
*/
if (trglen == 0)
*searchMode = GIN_SEARCH_MODE_ALL;
PG_RETURN_POINTER(entries);
}
Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
/* StrategyNumber strategy = PG_GETARG_UINT16(1); */
StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
int32 nkeys = PG_GETARG_INT32(3);
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
bool res;
int32 i,
ntrue = 0;
ntrue;
/* All cases served by this function are inexact */
*recheck = true;
switch (strategy)
{
case SimilarityStrategyNumber:
/* Count the matches */
ntrue = 0;
for (i = 0; i < nkeys; i++)
{
if (check[i])
ntrue++;
}
#ifdef DIVUNION
res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
#else
res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
#endif
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case LikeStrategyNumber:
/* Check if all extracted trigrams are presented. */
res = true;
for (i = 0; i < nkeys; i++)
{
if (!check[i])
{
res = false;
break;
}
}
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
break;
}
PG_RETURN_BOOL(res);
}
......@@ -195,31 +195,61 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
char *cache = (char *) fcinfo->flinfo->fn_extra;
/* All cases served by this function are exact */
*recheck = false;
char *cache = (char *) fcinfo->flinfo->fn_extra,
*cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
/*
* Store both the strategy number and extracted trigrams in cache, because
* trigram extraction is relatively CPU-expensive. We must include
* strategy number because trigram extraction depends on strategy.
*/
if (cache == NULL || strategy != *((StrategyNumber *) cache) ||
VARSIZE(cacheContents) != VARSIZE(query) ||
memcmp(cacheContents, query, VARSIZE(query)) != 0)
{
switch (strategy)
{
case SimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case LikeStrategyNumber:
qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
qtrg = NULL; /* keep compiler quiet */
break;
}
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
fcinfo->flinfo->fn_extra =
MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
MAXALIGN(sizeof(StrategyNumber)) +
MAXALIGN(VARSIZE(query)) +
VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
memcpy(cache, query, VARSIZE(query));
memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
*((StrategyNumber *) cache) = strategy;
memcpy(cacheContents, query, VARSIZE(query));
memcpy(cacheContents + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
/* Similarity search is exact */
*recheck = false;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
......@@ -242,6 +272,47 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
#endif
/* FALL THRU */
case LikeStrategyNumber:
/* Wildcard search is inexact */
*recheck = true;
/*
* Check if all the extracted trigrams can be present in child
* nodes.
*/
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
res = trgm_contained_by(qtrg, key);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
res = true;
}
else
{ /* non-leaf contains signature */
int32 k,
tmp = 0,
len = ARRNELEM(qtrg);
trgm *ptr = GETARR(qtrg);
BITVECP sign = GETSIGN(key);
res = true;
for (k = 0; k < len; k++)
{
CPTRGM(((char *) &tmp), ptr + k);
if (!GETBIT(sign, HASHVAL(tmp)))
{
res = false;
break;
}
}
}
break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
......
......@@ -18,6 +18,23 @@ float4 trgm_limit = 0.3f;
PG_FUNCTION_INFO_V1(set_limit);
Datum set_limit(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(show_limit);
Datum show_limit(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(show_trgm);
Datum show_trgm(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(similarity_dist);
Datum similarity_dist(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(similarity_op);
Datum similarity_op(PG_FUNCTION_ARGS);
Datum
set_limit(PG_FUNCTION_ARGS)
{
......@@ -29,8 +46,6 @@ set_limit(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(trgm_limit);
}
PG_FUNCTION_INFO_V1(show_limit);
Datum show_limit(PG_FUNCTION_ARGS);
Datum
show_limit(PG_FUNCTION_ARGS)
{
......@@ -120,7 +135,7 @@ cnt_trigram(trgm *tptr, char *str, int bytelen)
#endif
/*
* Adds trigramm from words (already padded).
* Adds trigrams from words (already padded).
*/
static trgm *
make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
......@@ -236,6 +251,225 @@ generate_trgm(char *str, int slen)
return trg;
}
/*
* Extract the next non-wildcard part of a search string, ie, a word bounded
* by '_' or '%' meta-characters, non-word characters or string end.
*
* str: source string, of length lenstr bytes (need not be null-terminated)
* buf: where to return the substring (must be long enough)
* *bytelen: receives byte length of the found substring
* *charlen: receives character length of the found substring
*
* Returns pointer to end+1 of the found substring in the source string.
* Returns NULL if no word found (in which case buf, bytelen, charlen not set)
*
* If the found word is bounded by non-word characters or string boundaries
* then this function will include corresponding padding spaces into buf.
*/
static const char *
get_wildcard_part(const char *str, int lenstr,
char *buf, int *bytelen, int *charlen)
{
const char *beginword = str;
const char *endword;
char *s = buf;
bool in_wildcard_meta = false;
bool in_escape = false;
int clen;
/*
* Find the first word character remembering whether last character was
* wildcard meta-character.
*/
while (beginword - str < lenstr)
{
if (in_escape)
{
in_escape = false;
in_wildcard_meta = false;
if (iswordchr(beginword))
break;
}
else
{
if (ISESCAPECHAR(beginword))
in_escape = true;
else if (ISWILDCARDCHAR(beginword))
in_wildcard_meta = true;
else if (iswordchr(beginword))
break;
else
in_wildcard_meta = false;
}
beginword += pg_mblen(beginword);
}
/*
* Handle string end.
*/
if (beginword - str >= lenstr)
return NULL;
/*
* Add left padding spaces if last character wasn't wildcard
* meta-character.
*/
*charlen = 0;
if (!in_wildcard_meta)
{
if (LPADDING > 0)
{
*s++ = ' ';
(*charlen)++;
if (LPADDING > 1)
{
*s++ = ' ';
(*charlen)++;
}
}
}
/*
* Copy data into buf until wildcard meta-character, non-word character or
* string boundary. Strip escapes during copy.
*/
endword = beginword;
in_wildcard_meta = false;
in_escape = false;
while (endword - str < lenstr)
{
clen = pg_mblen(endword);
if (in_escape)
{
in_escape = false;
in_wildcard_meta = false;
if (iswordchr(endword))
{
memcpy(s, endword, clen);
(*charlen)++;
s += clen;
}
else
break;
}
else
{
if (ISESCAPECHAR(endword))
in_escape = true;
else if (ISWILDCARDCHAR(endword))
{
in_wildcard_meta = true;
break;
}
else if (iswordchr(endword))
{
memcpy(s, endword, clen);
(*charlen)++;
s += clen;
}
else
{
in_wildcard_meta = false;
break;
}
}
endword += clen;
}
/*
* Add right padding spaces if last character wasn't wildcard
* meta-character.
*/
if (!in_wildcard_meta)
{
if (RPADDING > 0)
{
*s++ = ' ';
(*charlen)++;
if (RPADDING > 1)
{
*s++ = ' ';
(*charlen)++;
}
}
}
*bytelen = s - buf;
return endword;
}
/*
* Generates trigrams for wildcard search string.
*
* Returns array of trigrams that must occur in any string that matches the
* wildcard string. For example, given pattern "a%bcd%" the trigrams
* " a", "bcd" would be extracted.
*/
TRGM *
generate_wildcard_trgm(const char *str, int slen)
{
TRGM *trg;
char *buf,
*buf2;
trgm *tptr;
int len,
charlen,
bytelen;
const char *eword;
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
trg->flag = ARRKEY;
SET_VARSIZE(trg, TRGMHDRSIZE);
if (slen + LPADDING + RPADDING < 3 || slen == 0)
return trg;
tptr = GETARR(trg);
buf = palloc(sizeof(char) * (slen + 4));
/*
* Extract trigrams from each substring extracted by get_wildcard_part.
*/
eword = str;
while ((eword = get_wildcard_part(eword, slen - (eword - str),
buf, &bytelen, &charlen)) != NULL)
{
#ifdef IGNORECASE
buf2 = lowerstr_with_len(buf, bytelen);
bytelen = strlen(buf2);
#else
buf2 = buf;
#endif
/*
* count trigrams
*/
tptr = make_trigrams(tptr, buf2, bytelen, charlen);
#ifdef IGNORECASE
pfree(buf2);
#endif
}
pfree(buf);
if ((len = tptr - GETARR(trg)) == 0)
return trg;
/*
* Make trigrams unique.
*/
if (len > 0)
{
qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
len = unique_array(GETARR(trg), len);
}
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
return trg;
}
uint32
trgm2int(trgm *ptr)
{
......@@ -250,8 +484,6 @@ trgm2int(trgm *ptr)
return val;
}
PG_FUNCTION_INFO_V1(show_trgm);
Datum show_trgm(PG_FUNCTION_ARGS);
Datum
show_trgm(PG_FUNCTION_ARGS)
{
......@@ -340,8 +572,44 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
}
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
/*
* Returns whether trg2 contains all trigrams in trg1.
* This relies on the trigram arrays being sorted.
*/
bool
trgm_contained_by(TRGM *trg1, TRGM *trg2)
{
trgm *ptr1,
*ptr2;
int len1,
len2;
ptr1 = GETARR(trg1);
ptr2 = GETARR(trg2);
len1 = ARRNELEM(trg1);
len2 = ARRNELEM(trg2);
while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
{
int res = CMPTRGM(ptr1, ptr2);
if (res < 0)
return false;
else if (res > 0)
ptr2++;
else
{
ptr1++;
ptr2++;
}
}
if (ptr1 - GETARR(trg1) < len1)
return false;
else
return true;
}
Datum
similarity(PG_FUNCTION_ARGS)
{
......@@ -364,8 +632,6 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res);
}
PG_FUNCTION_INFO_V1(similarity_dist);
Datum similarity_dist(PG_FUNCTION_ARGS);
Datum
similarity_dist(PG_FUNCTION_ARGS)
{
......@@ -375,8 +641,6 @@ similarity_dist(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(1.0 - res);
}
PG_FUNCTION_INFO_V1(similarity_op);
Datum similarity_op(PG_FUNCTION_ARGS);
Datum
similarity_op(PG_FUNCTION_ARGS)
{
......
......@@ -25,11 +25,11 @@ DROP TYPE gtrgm CASCADE;
DROP OPERATOR CLASS gin_trgm_ops USING gin;
DROP FUNCTION gin_extract_trgm(text, internal);
DROP FUNCTION gin_extract_value_trgm(text, internal);
DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
DROP OPERATOR % (text, text);
......
......@@ -137,8 +137,10 @@
The <filename>pg_trgm</filename> module provides GiST and GIN index
operator classes that allow you to create an index over a text column for
the purpose of very fast similarity searches. These index types support
the above-described similarity operators (and no other operators, so you may
want a regular B-tree index too).
the above-described similarity operators, and additionally support
trigram-based index searches for <literal>LIKE</> and <literal>ILIKE</>
queries. (These indexes do not support equality nor simple comparison
operators, so you may need a regular B-tree index too.)
</para>
<para>
......@@ -181,6 +183,18 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
a small number of the closest matches is wanted.
</para>
<para>
Beginning in <productname>PostgreSQL</> 9.1, these index types also support
index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
<programlisting>
SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
</programlisting>
The index search works by extracting trigrams from the search string
and then looking these up in the index. The more trigrams in the search
string, the more effective the index search is. Unlike B-tree based
searches, the search string need not be left-anchored.
</para>
<para>
The choice between GiST and GIN indexing depends on the relative
performance characteristics of GiST and GIN, which are discussed elsewhere.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment