Commit f576b17c authored by Teodor Sigaev's avatar Teodor Sigaev

Add word_similarity to pg_trgm contrib module.

The patch introduces the concept of similarity between a string and the
best-matching word of another string.

The extension version is not changed because 1.2 was already introduced during
the 9.6 release cycle, so no public release has shipped with it yet.

Author: Alexander Korotkov, Artur Zakirov
parent 1c4f001b
......@@ -7,7 +7,7 @@ EXTENSION = pg_trgm
DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm
REGRESS = pg_trgm pg_word_trgm
ifdef USE_PGXS
PG_CONFIG = pg_config
......
......@@ -59,7 +59,7 @@ select similarity('---', '####---');
0
(1 row)
CREATE TABLE test_trgm(t text);
CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
t | sml
......@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
create table test2(t text);
create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
......
......@@ -3,10 +3,72 @@
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit
-- word_similarity(pattern, string): the C implementation treats the first
-- argument as the search pattern and the second as the text searched in.
CREATE FUNCTION word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Boolean form behind the <% operator: true when the word similarity of
-- (left, right) reaches pg_trgm.word_similarity_threshold.
CREATE FUNCTION word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Same test with the arguments swapped; backs the %> operator.
CREATE FUNCTION word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Distance form behind the <<-> operator: one minus the word similarity of
-- (left, right).
CREATE FUNCTION word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Same distance with the arguments swapped; backs the <->> operator.
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- pattern <% string
CREATE OPERATOR <% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_op,
COMMUTATOR = '%>',
RESTRICT = contsel,
JOIN = contjoinsel
);
-- string %> pattern (commutator of <%)
CREATE OPERATOR %> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_commutator_op,
COMMUTATOR = '<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
-- pattern <<-> string
CREATE OPERATOR <<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_op,
COMMUTATOR = '<->>'
);
-- string <->> pattern (commutator of <<->)
CREATE OPERATOR <->> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_commutator_op,
COMMUTATOR = '<<->'
);
-- GIN support function; it is registered as support function 6 of
-- gin_trgm_ops at the end of this script.
CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal)
RETURNS "char"
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Index support for the new operators.  Strategy numbers 7 and 8 match
-- WordSimilarityStrategyNumber and WordDistanceStrategyNumber in the C code.
-- Only the commutator forms (%> and <->>) are added to the operator families.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 7 %> (text, text),
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
-- GIN gets the boolean operator only; no ordering operator is added here.
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 7 %> (text, text),
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
......@@ -39,6 +39,39 @@ CREATE OPERATOR % (
JOIN = contjoinsel
);
-- word_similarity(pattern, string): the C implementation treats the first
-- argument as the search pattern and the second as the text searched in.
CREATE FUNCTION word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Boolean form behind the <% operator: true when the word similarity of
-- (left, right) reaches pg_trgm.word_similarity_threshold.
CREATE FUNCTION word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Same test with the arguments swapped; backs the %> operator.
CREATE FUNCTION word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- pattern <% string
CREATE OPERATOR <% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_op,
COMMUTATOR = '%>',
RESTRICT = contsel,
JOIN = contjoinsel
);
-- string %> pattern (commutator of <%)
CREATE OPERATOR %> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_commutator_op,
COMMUTATOR = '<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE FUNCTION similarity_dist(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
......@@ -51,6 +84,30 @@ CREATE OPERATOR <-> (
COMMUTATOR = '<->'
);
-- Distance form behind the <<-> operator: one minus the word similarity of
-- (left, right).
CREATE FUNCTION word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Same distance with the arguments swapped; backs the <->> operator.
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- pattern <<-> string
CREATE OPERATOR <<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_op,
COMMUTATOR = '<->>'
);
-- string <->> pattern (commutator of <<->)
CREATE OPERATOR <->> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_commutator_op,
COMMUTATOR = '<<->'
);
-- gist key
CREATE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm
......@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
-- Add operators that are new in 9.6 (pg_trgm 1.2).
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 7 %> (text, text),
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
-- support functions for gin
CREATE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal
......@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 7 %> (text, text),
FUNCTION 6 (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
......@@ -13,7 +13,7 @@ select similarity('wow',' WOW ');
select similarity('---', '####---');
CREATE TABLE test_trgm(t text);
CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data'
......@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
create table test2(t text);
create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
......
......@@ -32,7 +32,8 @@
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
typedef char trgm[3];
......@@ -103,15 +104,28 @@ typedef char *BITVECP;
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
/*
 * CALCSML computes a similarity value from the number of shared trigrams
 * ("count") and the trigram counts of the two inputs ("len1" and "len2").
 *
 * If DIVUNION is defined then similarity formula is:
 *		count / (len1 + len2 - count)
 * else if DIVUNION is not defined then similarity formula is:
 *		count / max(len1, len2)
 *
 * The expansion is wrapped in parentheses so that the macro can be used
 * inside a larger expression without operator-precedence surprises.  The
 * arguments may be evaluated more than once, so they must be free of side
 * effects.
 */
#ifdef DIVUNION
#define CALCSML(count, len1, len2) \
	(((float4) (count)) / ((float4) ((len1) + (len2) - (count))))
#else
#define CALCSML(count, len1, len2) \
	(((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2))))
#endif
typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold;
extern double word_similarity_threshold;
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
extern TRGM *createTrgmNFA(text *text_re, Oid collation,
......
......@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break;
case ILikeStrategyNumber:
......@@ -176,6 +177,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
bool res;
int32 i,
ntrue;
double nlimit;
/* All cases served by this function are inexact */
*recheck = true;
......@@ -183,6 +185,10 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */
ntrue = 0;
for (i = 0; i < nkeys; i++)
......@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
* So, independly on DIVUNION the upper bound formula is the same.
*/
res = (nkeys == 0) ? false :
((((((float4) ntrue) / ((float4) nkeys))) >= similarity_threshold)
? true : false);
(((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
......@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
int32 i,
ntrue;
bool *boolcheck;
double nlimit;
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */
ntrue = 0;
for (i = 0; i < nkeys; i++)
......@@ -285,8 +295,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
/*
* See comment in gin_trgm_consistent() about * upper bound formula
*/
res = (nkeys == 0) ? GIN_FALSE :
(((((float4) ntrue) / ((float4) nkeys)) >= similarity_threshold)
res = (nkeys == 0)
? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
? GIN_MAYBE : GIN_FALSE);
break;
case ILikeStrategyNumber:
......
......@@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
bool res;
Size querysize = VARSIZE(query);
gtrgm_consistent_cache *cache;
double nlimit;
/*
* We keep the extracted trigrams in cache, because trigram extraction is
......@@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
......@@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
/* Similarity search is exact */
*recheck = false;
case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the tmpsml variable using volatile
* keyword. Otherwise comparison of nlimit and tmpsml may give
* wrong results.
*/
float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = (*(int *) &tmpsml == *(int *) &similarity_threshold
|| tmpsml > similarity_threshold) ? true : false;
res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
......@@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
if (len == 0)
res = false;
else
res = (((((float8) count) / ((float8) len))) >= similarity_threshold)
? true : false;
res = (((((float8) count) / ((float8) len))) >= nlimit);
}
break;
case ILikeStrategyNumber:
......@@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
float8 res;
......@@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS)
switch (strategy)
{
case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
res = 1.0 - cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the sml variable using volatile
* keyword. Otherwise res can differ from the
* word_similarity_dist_op() function.
*/
float4 volatile sml = cnt_sml(qtrg, key, *recheck);
res = 1.0 - sml;
}
else if (ISALLTRUE(key))
{ /* all leafs contains orig trgm */
......
......@@ -16,6 +16,7 @@ PG_MODULE_MAGIC;
/* GUC variables */
double similarity_threshold = 0.3f;
double word_similarity_threshold = 0.6f;
void _PG_init(void);
......@@ -23,8 +24,20 @@ PG_FUNCTION_INFO_V1(set_limit);
PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
/* Trigram with position */
typedef struct
{
trgm trg;
int index;
} pos_trgm;
/*
* Module load callback
......@@ -45,11 +58,23 @@ _PG_init(void)
NULL,
NULL,
NULL);
/*
 * Register the threshold GUC for word similarity searches.  The description
 * strings are displayed as-is rather than being used as a printf format, so
 * the '%' of the operator name must not be doubled.
 */
DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
						 "Sets the threshold used by the <% operator.",
						 "Valid range is 0.0 .. 1.0.",
						 &word_similarity_threshold,
						 0.6,
						 0.0,
						 1.0,
						 PGC_USERSET,
						 0,
						 NULL,
						 NULL,
						 NULL);
}
/*
* Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
*/
Datum
set_limit(PG_FUNCTION_ARGS)
......@@ -59,14 +84,14 @@ set_limit(PG_FUNCTION_ARGS)
if (nlimit < 0 || nlimit > 1.0)
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("wrong limit, should be between 0 and 1")));
errmsg("wrong threshold, should be between 0 and 1")));
similarity_threshold = nlimit;
PG_RETURN_FLOAT4(similarity_threshold);
}
/*
* Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
*/
Datum
show_limit(PG_FUNCTION_ARGS)
......@@ -199,38 +224,28 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
return tptr;
}
TRGM *
generate_trgm(char *str, int slen)
/*
* Make array of trigrams without sorting and removing duplicate items.
*
* trg: where to return the array of trigrams.
* str: source string, of length slen bytes.
*
* Returns length of the generated array.
*/
static int
generate_trgm_only(trgm *trg, char *str, int slen)
{
TRGM *trg;
char *buf;
trgm *tptr;
int len,
charlen,
char *buf;
int charlen,
bytelen;
char *bword,
*eword;
/*
* Guard against possible overflow in the palloc requests below. (We
* don't worry about the additive constants, since palloc can detect
* requests that are a little above MaxAllocSize --- we just need to
* prevent integer overflow in the multiplications.)
*/
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of memory")));
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY;
SET_VARSIZE(trg, TRGMHDRSIZE);
if (slen + LPADDING + RPADDING < 3 || slen == 0)
return trg;
return 0;
tptr = GETARR(trg);
tptr = trg;
/* Allocate a buffer for case-folded, blank-padded words */
buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
......@@ -270,7 +285,47 @@ generate_trgm(char *str, int slen)
pfree(buf);
if ((len = tptr - GETARR(trg)) == 0)
return tptr - trg;
}
/*
 * Raise an error if a string of slen bytes is too long for the trigram and
 * case-folding buffers that are sized from it.
 *
 * Only the multiplications in those palloc requests can overflow; palloc
 * itself detects requests that are a little above MaxAllocSize, so the
 * additive constants need no check here.
 */
static void
protect_out_of_mem(int slen)
{
	/* Both size computations stay in range: nothing to complain about. */
	if ((Size) (slen / 2) < (MaxAllocSize / (sizeof(trgm) * 3)) &&
		(Size) slen < (MaxAllocSize / pg_database_encoding_max_length()))
		return;

	ereport(ERROR,
			(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			 errmsg("out of memory")));
}
/*
* Make array of trigrams with sorting and removing duplicate items.
*
* str: source string, of length slen bytes.
*
* Returns the sorted array of unique trigrams.
*/
TRGM *
generate_trgm(char *str, int slen)
{
TRGM *trg;
int len;
protect_out_of_mem(slen);
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY;
len = generate_trgm_only(GETARR(trg), str, slen);
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
if (len == 0)
return trg;
/*
......@@ -287,6 +342,285 @@ generate_trgm(char *str, int slen)
return trg;
}
/*
 * Build one array of positional trigrams out of two plain trigram arrays.
 *
 * trg1: trigrams of the search pattern, len1 entries.  Their positions do
 *		 not matter, so each is stored with index -1.
 * trg2: trigrams of the text being searched (the haystack), len2 entries.
 *		 Each is stored with its position within trg2.
 *
 * Returns a palloc'd array of len1 + len2 entries: the trg1 entries first,
 * followed by the trg2 entries.
 */
static pos_trgm *
make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
{
	int			total = len1 + len2;
	pos_trgm   *merged = (pos_trgm *) palloc(sizeof(pos_trgm) * total);
	pos_trgm   *dst = merged;
	int			pos;

	/* Pattern trigrams carry no position */
	for (pos = 0; pos < len1; pos++, dst++)
	{
		memcpy(&dst->trg, &trg1[pos], sizeof(trgm));
		dst->index = -1;
	}

	/* Text trigrams remember where they came from */
	for (pos = 0; pos < len2; pos++, dst++)
	{
		memcpy(&dst->trg, &trg2[pos], sizeof(trgm));
		dst->index = pos;
	}

	return merged;
}
/*
 * qsort comparator for positional trigrams: order by trigram value, and
 * break ties by position (ascending index).
 */
static int
comp_ptrgm(const void *v1, const void *v2)
{
	const pos_trgm *lhs = (const pos_trgm *) v1;
	const pos_trgm *rhs = (const pos_trgm *) v2;
	int			trgcmp = CMPTRGM(lhs->trg, rhs->trg);

	if (trgcmp != 0)
		return trgcmp;

	/* Same trigram: fall back to the position */
	if (lhs->index == rhs->index)
		return 0;

	return (lhs->index < rhs->index) ? -1 : 1;
}
/*
 * Iterative search function which scans the trigrams of the text from left
 * to right and calculates the similarity between the search pattern and a
 * contiguous range [lower, upper] of those trigrams.  The maximum similarity
 * is calculated only if check_only == false; with check_only == true the scan
 * stops as soon as the similarity reaches pg_trgm.word_similarity_threshold.
 *
 * trg2indexes: array which stores, for each trigram position of the text,
 *		an index into the array "found".
 * found: array which stores true or false values; an entry is true when the
 *		corresponding trigram is present in the search pattern.
 * ulen1: count of unique trigrams of array "trg1" (the search pattern).
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * check_only: if true then only check existence of similar search pattern in
 *		text.
 *
 * Returns word similarity.
 */
static float4
iterate_word_similarity(int *trg2indexes,
bool *found,
int ulen1,
int len2,
int len,
bool check_only)
{
int *lastpos,
i,
ulen2 = 0,
count = 0,
upper = -1,
lower = -1;
float4 smlr_cur,
smlr_max = 0.0f;
/* Memorise last position of each trigram; -1 means "not in current range" */
lastpos = (int *) palloc(sizeof(int) * len);
memset(lastpos, -1, sizeof(int) * len);
for (i = 0; i < len2; i++)
{
/* Get index of next trigram */
int trgindex = trg2indexes[i];
/*
 * Update last position of this trigram.  Trigrams seen before the range
 * has been opened (lower < 0) are ignored unless they match the pattern.
 */
if (lower >= 0 || found[trgindex])
{
if (lastpos[trgindex] < 0)
{
/* First occurrence inside the range: one more unique trigram */
ulen2++;
if (found[trgindex])
count++;
}
lastpos[trgindex] = i;
}
/* Adjust upper bound if this trigram is present in required substring */
if (found[trgindex])
{
int prev_lower,
tmp_ulen2,
tmp_lower,
tmp_count;
upper = i;
if (lower == -1)
{
/* The first matching trigram opens the range */
lower = i;
ulen2 = 1;
}
smlr_cur = CALCSML(count, ulen1, ulen2);
/* Also try to adjust lower bound for greater similarity */
tmp_count = count;
tmp_ulen2 = ulen2;
prev_lower = lower;
for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
{
float smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
int tmp_trgindex;
if (smlr_tmp > smlr_cur)
{
smlr_cur = smlr_tmp;
ulen2 = tmp_ulen2;
lower = tmp_lower;
count = tmp_count;
}
/*
 * if we only check that word similarity is greater than
 * pg_trgm.word_similarity_threshold we do not need to calculate
 * a maximum similarity.
 */
if (check_only && smlr_cur >= word_similarity_threshold)
break;
/*
 * Drop position tmp_lower from the candidate range: if it holds the
 * last occurrence of its trigram, that trigram leaves the range.
 */
tmp_trgindex = trg2indexes[tmp_lower];
if (lastpos[tmp_trgindex] == tmp_lower)
{
tmp_ulen2--;
if (found[tmp_trgindex])
tmp_count--;
}
}
smlr_max = Max(smlr_max, smlr_cur);
/*
 * if we only check that word similarity is greater than
 * pg_trgm.word_similarity_threshold we do not need to calculate a
 * maximum similarity
 */
if (check_only && smlr_max >= word_similarity_threshold)
break;
/* Forget trigrams whose last occurrence fell below the new lower bound */
for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
{
int tmp_trgindex;
tmp_trgindex = trg2indexes[tmp_lower];
if (lastpos[tmp_trgindex] == tmp_lower)
lastpos[tmp_trgindex] = -1;
}
}
}
pfree(lastpos);
return smlr_max;
}
/*
 * Calculate word similarity.
 * This function prepares two arrays: "trg2indexes" and "found". Then these
 * arrays are used to calculate word similarity using
 * iterate_word_similarity().
 *
 * "trg2indexes" is array which stores indexes of the array "found".
 * In other words:
 *		trg2indexes[j] = i;
 *		found[i] = true (or false);
 * If found[i] == true then there is trigram trg2[j] in array "trg1".
 * If found[i] == false then there is not trigram trg2[j] in array "trg1".
 *
 * str1: search pattern string, of length slen1 bytes.
 * str2: text in which we are looking for a word, of length slen2 bytes.
 * check_only: if true then only check existence of similar search pattern in
 *			   text.
 *
 * Returns word similarity.  The result is zero when either string yields no
 * trigrams.
 */
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
					 bool check_only)
{
	bool	   *found;
	pos_trgm   *ptrg;
	trgm	   *trg1;
	trgm	   *trg2;
	int			len1,
				len2,
				len,
				i,
				j,
				ulen1;
	int		   *trg2indexes;
	float4		result;

	protect_out_of_mem(slen1 + slen2);

	/* Make positional trigrams */
	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);

	len1 = generate_trgm_only(trg1, str1, slen1);
	len2 = generate_trgm_only(trg2, str2, slen2);

	/*
	 * Without pattern trigrams nothing can match, and without text trigrams
	 * there is nothing to scan, so the similarity is zero in both cases.
	 * Leaving here also keeps the code below from reading found[0] when the
	 * "found" array would have zero length.
	 */
	if (len1 == 0 || len2 == 0)
	{
		pfree(trg1);
		pfree(trg2);
		return 0.0f;
	}

	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
	len = len1 + len2;
	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);

	pfree(trg1);
	pfree(trg2);

	/*
	 * Merge positional trigrams array: enumerate each trigram and find its
	 * presence in required word.
	 */
	trg2indexes = (int *) palloc(sizeof(int) * len2);
	found = (bool *) palloc0(sizeof(bool) * len);

	ulen1 = 0;
	j = 0;
	for (i = 0; i < len; i++)
	{
		if (i > 0)
		{
			int			cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);

			if (cmp != 0)
			{
				/* A new distinct trigram starts: close the previous one */
				if (found[j])
					ulen1++;
				j++;
			}
		}

		if (ptrg[i].index >= 0)
		{
			/* Trigram of the text: remember which distinct trigram it is */
			trg2indexes[ptrg[i].index] = j;
		}
		else
		{
			/* Trigram of the search pattern */
			found[j] = true;
		}
	}
	/* Account for the last distinct trigram */
	if (found[j])
		ulen1++;

	/* Run iterative procedure to find maximum similarity with word */
	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
									 check_only);

	pfree(trg2indexes);
	pfree(found);
	pfree(ptrg);

	return result;
}
/*
* Extract the next non-wildcard part of a search string, ie, a word bounded
* by '_' or '%' meta-characters, non-word characters or string end.
......@@ -459,17 +793,7 @@ generate_wildcard_trgm(const char *str, int slen)
bytelen;
const char *eword;
/*
* Guard against possible overflow in the palloc requests below. (We
* don't worry about the additive constants, since palloc can detect
* requests that are a little above MaxAllocSize --- we just need to
* prevent integer overflow in the multiplications.)
*/
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of memory")));
protect_out_of_mem(slen);
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY;
......@@ -590,7 +914,7 @@ show_trgm(PG_FUNCTION_ARGS)
}
float4
cnt_sml(TRGM *trg1, TRGM *trg2)
cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
{
trgm *ptr1,
*ptr2;
......@@ -624,14 +948,15 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
}
}
#ifdef DIVUNION
return ((float4) count) / ((float4) (len1 + len2 - count));
#else
return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
#endif
/*
* If inexact then len2 is equal to count, because we don't know actual
* length of second string in inexact search and we can assume that count
* is a lower bound of len2.
*/
return CALCSML(count, len1, inexact ? count : len2);
}
/*
* Returns whether trg2 contains all trigrams in trg1.
* This relies on the trigram arrays being sorted.
......@@ -726,7 +1051,7 @@ similarity(PG_FUNCTION_ARGS)
trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
res = cnt_sml(trg1, trg2);
res = cnt_sml(trg1, trg2, false);
pfree(trg1);
pfree(trg2);
......@@ -736,6 +1061,22 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res);
}
/*
 * SQL-callable word_similarity(text, text): the first argument is the search
 * pattern, the second is the text in which a similar word is looked for.
 * Computes the similarity in full (check_only = false).
 */
Datum
word_similarity(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	float4		sml;

	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(haystack),
							   VARSIZE_ANY_EXHDR(haystack),
							   false);

	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);
	PG_RETURN_FLOAT4(sml);
}
Datum
similarity_dist(PG_FUNCTION_ARGS)
{
......@@ -755,3 +1096,67 @@ similarity_op(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(res >= similarity_threshold);
}
/*
 * Boolean word similarity test: the first argument is the search pattern,
 * the second is the text.  Only checks whether the similarity reaches
 * word_similarity_threshold (check_only = true).
 */
Datum
word_similarity_op(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	float4		sml;

	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(haystack),
							   VARSIZE_ANY_EXHDR(haystack),
							   true);

	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);
	PG_RETURN_BOOL(sml >= word_similarity_threshold);
}
/*
 * Commutator of word_similarity_op(): here the first argument is the text
 * and the second is the search pattern, so they are handed to
 * calc_word_similarity() in swapped order.
 */
Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *haystack = PG_GETARG_TEXT_PP(0);
	text	   *pattern = PG_GETARG_TEXT_PP(1);
	float4		sml;

	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(haystack),
							   VARSIZE_ANY_EXHDR(haystack),
							   true);

	PG_FREE_IF_COPY(haystack, 0);
	PG_FREE_IF_COPY(pattern, 1);
	PG_RETURN_BOOL(sml >= word_similarity_threshold);
}
/*
 * Word similarity distance: one minus the word similarity, with the first
 * argument as the search pattern and the second as the text.
 */
Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)
{
	text	   *pattern = PG_GETARG_TEXT_PP(0);
	text	   *haystack = PG_GETARG_TEXT_PP(1);
	float4		sml;

	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(haystack),
							   VARSIZE_ANY_EXHDR(haystack),
							   false);

	PG_FREE_IF_COPY(pattern, 0);
	PG_FREE_IF_COPY(haystack, 1);
	PG_RETURN_FLOAT4(1.0 - sml);
}
/*
 * Commutator of word_similarity_dist_op(): the first argument is the text
 * and the second is the search pattern, so they are handed to
 * calc_word_similarity() in swapped order.
 */
Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *haystack = PG_GETARG_TEXT_PP(0);
	text	   *pattern = PG_GETARG_TEXT_PP(1);
	float4		sml;

	sml = calc_word_similarity(VARDATA_ANY(pattern),
							   VARSIZE_ANY_EXHDR(pattern),
							   VARDATA_ANY(haystack),
							   VARSIZE_ANY_EXHDR(haystack),
							   false);

	PG_FREE_IF_COPY(haystack, 0);
	PG_FREE_IF_COPY(pattern, 1);
	PG_RETURN_FLOAT4(1.0 - sml);
}
......@@ -92,6 +92,21 @@
(In practice this is seldom useful except for debugging.)
</entry>
</row>
<row>
<entry>
<function>word_similarity(text, text)</function>
<indexterm><primary>word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Returns a number that indicates how similar the first string is
to the most similar word of the second string. The function searches
the second string for the most similar word, not for the most similar
substring. The range of the result is zero (indicating that the two
strings are completely dissimilar) to one (indicating that the first
string is identical to one of the words of the second string).
</entry>
</row>
<row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry>
......@@ -137,6 +152,16 @@
<varname>pg_trgm.similarity_threshold</>.
</entry>
</row>
<row>
<entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</> if its first argument contains a word that is
similar to the second argument, that is, if their word similarity is at
least the current word similarity threshold set by the
<varname>pg_trgm.word_similarity_threshold</> parameter.
</entry>
</row>
<row>
<entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
<entry><type>real</type></entry>
......@@ -145,6 +170,16 @@
one minus the <function>similarity()</> value.
</entry>
</row>
<row>
<entry>
<type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</> between the arguments, that is
one minus the <function>word_similarity()</> value of the second
argument and the first argument, in that order.
</entry>
</row>
</tbody>
</tgroup>
</table>
......@@ -168,6 +203,23 @@
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
<term>
<varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
<indexterm>
<primary>
<varname>pg_trgm.word_similarity_threshold</> configuration parameter
</primary>
</indexterm>
</term>
<listitem>
<para>
Sets the current word similarity threshold that is used by
the <literal>&lt;%</> and <literal>%&gt;</> operators. The threshold
must be between 0 and 1 (default is 0.6).
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
......@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
a small number of the closest matches is wanted.
</para>
<para>
Also you can use an index on the <structfield>t</> column for word
similarity. For example:
<programlisting>
SELECT t, word_similarity('<replaceable>word</>', t) AS sml
FROM test_trgm
WHERE t %&gt; '<replaceable>word</>'
ORDER BY sml DESC, t;
</programlisting>
This will return all values in the text column that have a word
which is sufficiently similar to <replaceable>word</>, sorted from best
match to worst. The index will be used to make this a fast operation
even over very large data sets.
</para>
<para>
A variant of the above query is
<programlisting>
SELECT t, t &lt;-&gt;&gt; '<replaceable>word</>' AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes.
</para>
<para>
Beginning in <productname>PostgreSQL</> 9.1, these index types also support
index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment