Commit f576b17c authored by Teodor Sigaev's avatar Teodor Sigaev

Add word_similarity to pg_trgm contrib module.

This patch introduces the concept of similarity between a string and a
single word of another string.

The extension version is not changed because 1.2 was already introduced in the
9.6 release cycle, so no public release has shipped with version 1.2 yet.

Author: Alexander Korotkov, Artur Zakirov
parent 1c4f001b
...@@ -7,7 +7,7 @@ EXTENSION = pg_trgm ...@@ -7,7 +7,7 @@ EXTENSION = pg_trgm
DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching" PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm REGRESS = pg_trgm pg_word_trgm
ifdef USE_PGXS ifdef USE_PGXS
PG_CONFIG = pg_config PG_CONFIG = pg_config
......
...@@ -59,7 +59,7 @@ select similarity('---', '####---'); ...@@ -59,7 +59,7 @@ select similarity('---', '####---');
0 0
(1 row) (1 row)
CREATE TABLE test_trgm(t text); CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data' \copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t; select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
t | sml t | sml
...@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198 ...@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333 qwertyu0988 | 0.333333
(1 row) (1 row)
create table test2(t text); create table test2(t text COLLATE "C");
insert into test2 values ('abcdef'); insert into test2 values ('abcdef');
insert into test2 values ('quark'); insert into test2 values ('quark');
insert into test2 values (' z foo bar'); insert into test2 values (' z foo bar');
......
...@@ -3,10 +3,72 @@ ...@@ -3,10 +3,72 @@
-- complain if script is sourced in psql, rather than via ALTER EXTENSION -- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit \echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit
-- Word similarity support: SQL-callable functions.

-- Plain function form; returns the similarity value as float4.
CREATE FUNCTION word_similarity(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- Implementation of the <% operator.
-- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION word_similarity_op(text, text)
    RETURNS bool
    AS 'MODULE_PATHNAME'
    LANGUAGE C STABLE STRICT;

-- Implementation of the %> operator (commutator of <%).
-- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION word_similarity_commutator_op(text, text)
    RETURNS bool
    AS 'MODULE_PATHNAME'
    LANGUAGE C STABLE STRICT;

-- Implementation of the <<-> distance operator.
CREATE FUNCTION word_similarity_dist_op(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- Implementation of the <->> distance operator (commutator of <<->).
CREATE FUNCTION word_similarity_dist_commutator_op(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- Word similarity support: operators.  Each operator names its mirror
-- image as commutator.

CREATE OPERATOR <% (
    PROCEDURE  = word_similarity_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '%>',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);

CREATE OPERATOR %> (
    PROCEDURE  = word_similarity_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<%',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);

CREATE OPERATOR <<-> (
    PROCEDURE  = word_similarity_dist_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<->>'
);

CREATE OPERATOR <->> (
    PROCEDURE  = word_similarity_dist_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<->'
);
CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal) CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal)
RETURNS "char" RETURNS "char"
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT; LANGUAGE C IMMUTABLE STRICT;
-- Register the word similarity operators with the GiST operator family:
-- strategy 7 is the %> search operator, strategy 8 the <->> ordering operator.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
    OPERATOR 7 %> (text, text),
    OPERATOR 8 <->> (text, text)
        FOR ORDER BY pg_catalog.float_ops;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal); OPERATOR 7 %> (text, text),
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
...@@ -39,6 +39,39 @@ CREATE OPERATOR % ( ...@@ -39,6 +39,39 @@ CREATE OPERATOR % (
JOIN = contjoinsel JOIN = contjoinsel
); );
-- Word similarity: plain function form; returns the similarity as float4.
CREATE FUNCTION word_similarity(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- Implementation of the <% operator.
-- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION word_similarity_op(text, text)
    RETURNS bool
    AS 'MODULE_PATHNAME'
    LANGUAGE C STABLE STRICT;

-- Implementation of the %> operator (commutator of <%).
-- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION word_similarity_commutator_op(text, text)
    RETURNS bool
    AS 'MODULE_PATHNAME'
    LANGUAGE C STABLE STRICT;

-- The two boolean operators name each other as commutators.
CREATE OPERATOR <% (
    PROCEDURE  = word_similarity_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '%>',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);

CREATE OPERATOR %> (
    PROCEDURE  = word_similarity_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<%',
    RESTRICT   = contsel,
    JOIN       = contjoinsel
);
CREATE FUNCTION similarity_dist(text,text) CREATE FUNCTION similarity_dist(text,text)
RETURNS float4 RETURNS float4
AS 'MODULE_PATHNAME' AS 'MODULE_PATHNAME'
...@@ -51,6 +84,30 @@ CREATE OPERATOR <-> ( ...@@ -51,6 +84,30 @@ CREATE OPERATOR <-> (
COMMUTATOR = '<->' COMMUTATOR = '<->'
); );
-- Implementation of the <<-> word distance operator.
CREATE FUNCTION word_similarity_dist_op(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- Implementation of the <->> word distance operator (commutator of <<->).
CREATE FUNCTION word_similarity_dist_commutator_op(text, text)
    RETURNS float4
    AS 'MODULE_PATHNAME'
    LANGUAGE C IMMUTABLE STRICT;

-- The two distance operators name each other as commutators.
CREATE OPERATOR <<-> (
    PROCEDURE  = word_similarity_dist_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<->>'
);

CREATE OPERATOR <->> (
    PROCEDURE  = word_similarity_dist_commutator_op,
    LEFTARG    = text,
    RIGHTARG   = text,
    COMMUTATOR = '<<->'
);
-- gist key -- gist key
CREATE FUNCTION gtrgm_in(cstring) CREATE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm RETURNS gtrgm
...@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD ...@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text), OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text); OPERATOR 6 pg_catalog.~* (text, text);
-- Add operators that are new in 9.6 (pg_trgm 1.2):
-- strategy 7 is the %> search operator, strategy 8 the <->> ordering operator.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
    OPERATOR 7 %> (text, text),
    OPERATOR 8 <->> (text, text)
        FOR ORDER BY pg_catalog.float_ops;
-- support functions for gin -- support functions for gin
CREATE FUNCTION gin_extract_value_trgm(text, internal) CREATE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal RETURNS internal
...@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME' ...@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT; LANGUAGE C IMMUTABLE STRICT;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 7 %> (text, text),
FUNCTION 6 (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal); FUNCTION 6 (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
...@@ -13,7 +13,7 @@ select similarity('wow',' WOW '); ...@@ -13,7 +13,7 @@ select similarity('wow',' WOW ');
select similarity('---', '####---'); select similarity('---', '####---');
CREATE TABLE test_trgm(t text); CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data' \copy test_trgm from 'data/trgm.data'
...@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098 ...@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t; select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t; select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
create table test2(t text); create table test2(t text COLLATE "C");
insert into test2 values ('abcdef'); insert into test2 values ('abcdef');
insert into test2 values ('quark'); insert into test2 values ('quark');
insert into test2 values (' z foo bar'); insert into test2 values (' z foo bar');
......
...@@ -26,13 +26,14 @@ ...@@ -26,13 +26,14 @@
#define DIVUNION #define DIVUNION
/* operator strategy numbers */ /* operator strategy numbers */
#define SimilarityStrategyNumber 1 #define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2 #define DistanceStrategyNumber 2
#define LikeStrategyNumber 3 #define LikeStrategyNumber 3
#define ILikeStrategyNumber 4 #define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5 #define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6 #define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
typedef char trgm[3]; typedef char trgm[3];
...@@ -103,15 +104,28 @@ typedef char *BITVECP; ...@@ -103,15 +104,28 @@ typedef char *BITVECP;
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) ) #define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) ) #define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
/*
 * Similarity of two trigram sets, computed from the number of trigrams they
 * share (count) and the number of trigrams in each set (len1, len2).
 *
 * If DIVUNION is defined then similarity formula is:
 * count / (len1 + len2 - count)
 * else if DIVUNION is not defined then similarity formula is:
 * count / max(len1, len2)
 *
 * The whole expansion is parenthesized so the macro acts as a single operand
 * inside a larger expression (for example x / CALCSML(...)).  Arguments can
 * be evaluated more than once, so they must not have side effects.  The
 * macro does not protect against a zero divisor.
 */
#ifdef DIVUNION
#define CALCSML(count, len1, len2) (((float4) (count)) / ((float4) ((len1) + (len2) - (count))))
#else
#define CALCSML(count, len1, len2) (((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2))))
#endif
typedef struct TrgmPackedGraph TrgmPackedGraph; typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold; extern double similarity_threshold;
extern double word_similarity_threshold;
extern uint32 trgm2int(trgm *ptr); extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen); extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen); extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen); extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2); extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2); extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern bool *trgm_presence_map(TRGM *query, TRGM *key); extern bool *trgm_presence_map(TRGM *query, TRGM *key);
extern TRGM *createTrgmNFA(text *text_re, Oid collation, extern TRGM *createTrgmNFA(text *text_re, Oid collation,
......
...@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) ...@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
switch (strategy) switch (strategy)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ); trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
...@@ -176,6 +177,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -176,6 +177,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
bool res; bool res;
int32 i, int32 i,
ntrue; ntrue;
double nlimit;
/* All cases served by this function are inexact */ /* All cases served by this function are inexact */
*recheck = true; *recheck = true;
...@@ -183,6 +185,10 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -183,6 +185,10 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
switch (strategy) switch (strategy)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;
for (i = 0; i < nkeys; i++) for (i = 0; i < nkeys; i++)
...@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
* So, independly on DIVUNION the upper bound formula is the same. * So, independly on DIVUNION the upper bound formula is the same.
*/ */
res = (nkeys == 0) ? false : res = (nkeys == 0) ? false :
((((((float4) ntrue) / ((float4) nkeys))) >= similarity_threshold) (((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
? true : false);
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
#ifndef IGNORECASE #ifndef IGNORECASE
...@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) ...@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
int32 i, int32 i,
ntrue; ntrue;
bool *boolcheck; bool *boolcheck;
double nlimit;
switch (strategy) switch (strategy)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;
for (i = 0; i < nkeys; i++) for (i = 0; i < nkeys; i++)
...@@ -285,9 +295,9 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) ...@@ -285,9 +295,9 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
/* /*
* See comment in gin_trgm_consistent() about * upper bound formula * See comment in gin_trgm_consistent() about * upper bound formula
*/ */
res = (nkeys == 0) ? GIN_FALSE : res = (nkeys == 0)
(((((float4) ntrue) / ((float4) nkeys)) >= similarity_threshold) ? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
? GIN_MAYBE : GIN_FALSE); ? GIN_MAYBE : GIN_FALSE);
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
#ifndef IGNORECASE #ifndef IGNORECASE
......
...@@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
bool res; bool res;
Size querysize = VARSIZE(query); Size querysize = VARSIZE(query);
gtrgm_consistent_cache *cache; gtrgm_consistent_cache *cache;
double nlimit;
/* /*
* We keep the extracted trigrams in cache, because trigram extraction is * We keep the extracted trigrams in cache, because trigram extraction is
...@@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy) switch (strategy)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query), qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ); querysize - VARHDRSZ);
break; break;
...@@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy) switch (strategy)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
/* Similarity search is exact */ case WordSimilarityStrategyNumber:
*recheck = false; /* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg); /*
* Prevent gcc optimizing the tmpsml variable using volatile
* keyword. Otherwise comparison of nlimit and tmpsml may give
* wrong results.
*/
float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */ /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = (*(int *) &tmpsml == *(int *) &similarity_threshold res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
|| tmpsml > similarity_threshold) ? true : false;
} }
else if (ISALLTRUE(key)) else if (ISALLTRUE(key))
{ /* non-leaf contains signature */ { /* non-leaf contains signature */
...@@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
if (len == 0) if (len == 0)
res = false; res = false;
else else
res = (((((float8) count) / ((float8) len))) >= similarity_threshold) res = (((((float8) count) / ((float8) len))) >= nlimit);
? true : false;
} }
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
...@@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS) ...@@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */ /* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key); TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg; TRGM *qtrg;
float8 res; float8 res;
...@@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS) ...@@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS)
switch (strategy) switch (strategy)
{ {
case DistanceStrategyNumber: case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
res = 1.0 - cnt_sml(key, qtrg); /*
* Prevent gcc optimizing the sml variable using volatile
* keyword. Otherwise res can differ from the
* word_similarity_dist_op() function.
*/
float4 volatile sml = cnt_sml(qtrg, key, *recheck);
res = 1.0 - sml;
} }
else if (ISALLTRUE(key)) else if (ISALLTRUE(key))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
/* GUC variables */ /* GUC variables */
double similarity_threshold = 0.3f; double similarity_threshold = 0.3f;
double word_similarity_threshold = 0.6f;
void _PG_init(void); void _PG_init(void);
...@@ -23,8 +24,20 @@ PG_FUNCTION_INFO_V1(set_limit); ...@@ -23,8 +24,20 @@ PG_FUNCTION_INFO_V1(set_limit);
PG_FUNCTION_INFO_V1(show_limit); PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm); PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity); PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist); PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op); PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
/*
 * Trigram with position.
 *
 * Used by the word similarity code to sort the trigrams of the search
 * pattern and of the searched text together while remembering where each
 * text trigram came from.
 */
typedef struct
{
	trgm		trg;			/* the trigram itself */
	int			index;			/* position in the searched text's trigram
								 * array; make_positional_trgm() stores -1
								 * for trigrams of the search pattern */
} pos_trgm;
/* /*
* Module load callback * Module load callback
...@@ -45,11 +58,23 @@ _PG_init(void) ...@@ -45,11 +58,23 @@ _PG_init(void)
NULL, NULL,
NULL, NULL,
NULL); NULL);
DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
"Sets the threshold used by the <%% operator.",
"Valid range is 0.0 .. 1.0.",
&word_similarity_threshold,
0.6,
0.0,
1.0,
PGC_USERSET,
0,
NULL,
NULL,
NULL);
} }
/* /*
* Deprecated function. * Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
*/ */
Datum Datum
set_limit(PG_FUNCTION_ARGS) set_limit(PG_FUNCTION_ARGS)
...@@ -59,14 +84,14 @@ set_limit(PG_FUNCTION_ARGS) ...@@ -59,14 +84,14 @@ set_limit(PG_FUNCTION_ARGS)
if (nlimit < 0 || nlimit > 1.0) if (nlimit < 0 || nlimit > 1.0)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("wrong limit, should be between 0 and 1"))); errmsg("wrong threshold, should be between 0 and 1")));
similarity_threshold = nlimit; similarity_threshold = nlimit;
PG_RETURN_FLOAT4(similarity_threshold); PG_RETURN_FLOAT4(similarity_threshold);
} }
/* /*
* Deprecated function. * Deprecated function.
* Use "pg_trgm.similarity_threshold" GUC variable instead of this function * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
*/ */
Datum Datum
show_limit(PG_FUNCTION_ARGS) show_limit(PG_FUNCTION_ARGS)
...@@ -199,38 +224,28 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) ...@@ -199,38 +224,28 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
return tptr; return tptr;
} }
TRGM * /*
generate_trgm(char *str, int slen) * Make array of trigrams without sorting and removing duplicate items.
*
* trg: where to return the array of trigrams.
* str: source string, of length slen bytes.
*
* Returns length of the generated array.
*/
static int
generate_trgm_only(trgm *trg, char *str, int slen)
{ {
TRGM *trg;
char *buf;
trgm *tptr; trgm *tptr;
int len, char *buf;
charlen, int charlen,
bytelen; bytelen;
char *bword, char *bword,
*eword; *eword;
/*
* Guard against possible overflow in the palloc requests below. (We
* don't worry about the additive constants, since palloc can detect
* requests that are a little above MaxAllocSize --- we just need to
* prevent integer overflow in the multiplications.)
*/
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of memory")));
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY;
SET_VARSIZE(trg, TRGMHDRSIZE);
if (slen + LPADDING + RPADDING < 3 || slen == 0) if (slen + LPADDING + RPADDING < 3 || slen == 0)
return trg; return 0;
tptr = GETARR(trg); tptr = trg;
/* Allocate a buffer for case-folded, blank-padded words */ /* Allocate a buffer for case-folded, blank-padded words */
buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4); buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
...@@ -270,7 +285,47 @@ generate_trgm(char *str, int slen) ...@@ -270,7 +285,47 @@ generate_trgm(char *str, int slen)
pfree(buf); pfree(buf);
if ((len = tptr - GETARR(trg)) == 0) return tptr - trg;
}
/*
* Guard against possible overflow in the palloc requests below. (We
* don't worry about the additive constants, since palloc can detect
* requests that are a little above MaxAllocSize --- we just need to
* prevent integer overflow in the multiplications.)
*/
static void
protect_out_of_mem(int slen)
{
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of memory")));
}
/*
* Make array of trigrams with sorting and removing duplicate items.
*
* str: source string, of length slen bytes.
*
* Returns the sorted array of unique trigrams.
*/
TRGM *
generate_trgm(char *str, int slen)
{
TRGM *trg;
int len;
protect_out_of_mem(slen);
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY;
len = generate_trgm_only(GETARR(trg), str, slen);
SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
if (len == 0)
return trg; return trg;
/* /*
...@@ -287,6 +342,285 @@ generate_trgm(char *str, int slen) ...@@ -287,6 +342,285 @@ generate_trgm(char *str, int slen)
return trg; return trg;
} }
/*
 * Build one array of positional trigrams out of two plain trigram arrays.
 *
 * trg1: trigrams of the search pattern, len1 entries.  Their positions do
 *		 not matter, so every one of them gets index -1.
 * trg2: trigrams of the text being searched (the haystack), len2 entries.
 *		 Each one keeps its position in trg2 as index.
 *
 * Returns a palloc'd array of len1 + len2 entries: the trg1 entries first,
 * followed by the trg2 entries.
 */
static pos_trgm *
make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
{
	int			total = len1 + len2;
	pos_trgm   *result = (pos_trgm *) palloc(sizeof(pos_trgm) * total);
	pos_trgm   *dst = result;
	int			pos;

	/* Pattern trigrams: position is irrelevant */
	for (pos = 0; pos < len1; pos++, dst++)
	{
		memcpy(&dst->trg, &trg1[pos], sizeof(trgm));
		dst->index = -1;
	}

	/* Text trigrams: remember where each one came from */
	for (pos = 0; pos < len2; pos++, dst++)
	{
		memcpy(&dst->trg, &trg2[pos], sizeof(trgm));
		dst->index = pos;
	}

	return result;
}
/*
 * qsort comparator for positional trigrams: order by trigram first and by
 * position (index) second.
 */
static int
comp_ptrgm(const void *v1, const void *v2)
{
	const pos_trgm *left = (const pos_trgm *) v1;
	const pos_trgm *right = (const pos_trgm *) v2;
	int			trg_order = CMPTRGM(left->trg, right->trg);

	if (trg_order != 0)
		return trg_order;

	/* Same trigram: yields -1, 0 or 1 according to the positions */
	return (left->index > right->index) - (left->index < right->index);
}
/*
 * Iterative search function which calculates maximum similarity with word in
 * the string. But maximum similarity is calculated only if check_only == false.
 *
 * The positions of "trg2" are scanned from left to right while a window
 * [lower, upper] of positions is maintained.  Each time a trigram that is
 * present in the search pattern is met, the window is extended up to it and
 * then its lower bound is moved right as long as that raises the similarity
 * between the pattern and the trigrams inside the window.
 *
 * trg2indexes: for every position of "trg2", the index of that trigram in
 *		the array "found".
 * found: array which stores true or false values; found[k] is true when
 *		trigram k is present in the search pattern.
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * check_only: if true then only check existence of similar search pattern in
 *		text: the search stops as soon as the similarity reaches
 *		pg_trgm.word_similarity_threshold.
 *
 * Returns word similarity: the maximum found when check_only is false,
 * otherwise the best value seen before the search stopped.
 */
static float4
iterate_word_similarity(int *trg2indexes,
						bool *found,
						int ulen1,
						int len2,
						int len,
						bool check_only)
{
	int		   *lastpos,
				i,
				ulen2 = 0,
				count = 0,
				upper = -1,
				lower = -1;
	float4		smlr_cur,
				smlr_max = 0.0f;

	/*
	 * Memorise last position of each trigram.  Filling every byte with 0xFF
	 * makes each int entry -1, meaning "not inside the current window".
	 */
	lastpos = (int *) palloc(sizeof(int) * len);
	memset(lastpos, -1, sizeof(int) * len);

	for (i = 0; i < len2; i++)
	{
		/* Get index of next trigram */
		int			trgindex = trg2indexes[i];

		/*
		 * Update last position of this trigram.  Trigrams are tracked only
		 * once a window exists (lower >= 0) or when this trigram starts one.
		 * ulen2 counts distinct trigrams in the window, count those of them
		 * that are present in the search pattern.
		 */
		if (lower >= 0 || found[trgindex])
		{
			if (lastpos[trgindex] < 0)
			{
				ulen2++;
				if (found[trgindex])
					count++;
			}
			lastpos[trgindex] = i;
		}

		/* Adjust upper bound if this trigram is present in required substring */
		if (found[trgindex])
		{
			int			prev_lower,
						tmp_ulen2,
						tmp_lower,
						tmp_count;

			upper = i;
			if (lower == -1)
			{
				/* Window starts here and holds just this one trigram */
				lower = i;
				ulen2 = 1;
			}

			smlr_cur = CALCSML(count, ulen1, ulen2);

			/*
			 * Also try to adjust lower bound for greater similarity: walk
			 * tmp_lower from the current lower bound up to upper and keep
			 * the position giving the best similarity.
			 */
			tmp_count = count;
			tmp_ulen2 = ulen2;
			prev_lower = lower;
			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
			{
				float		smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
				int			tmp_trgindex;

				if (smlr_tmp > smlr_cur)
				{
					smlr_cur = smlr_tmp;
					ulen2 = tmp_ulen2;
					lower = tmp_lower;
					count = tmp_count;
				}

				/*
				 * if we only check that word similarity is greater than
				 * pg_trgm.word_similarity_threshold we do not need to calculate
				 * a maximum similarity.
				 */
				if (check_only && smlr_cur >= word_similarity_threshold)
					break;

				/*
				 * Dropping position tmp_lower removes its trigram from the
				 * window only if this was the trigram's last occurrence.
				 */
				tmp_trgindex = trg2indexes[tmp_lower];
				if (lastpos[tmp_trgindex] == tmp_lower)
				{
					tmp_ulen2--;
					if (found[tmp_trgindex])
						tmp_count--;
				}
			}

			smlr_max = Max(smlr_max, smlr_cur);

			/*
			 * if we only check that word similarity is greater than
			 * pg_trgm.word_similarity_threshold we do not need to calculate a
			 * maximum similarity
			 */
			if (check_only && smlr_max >= word_similarity_threshold)
				break;

			/*
			 * Forget trigrams whose last occurrence lies below the new lower
			 * bound, so they count as new if they are met again.
			 */
			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
			{
				int			tmp_trgindex;

				tmp_trgindex = trg2indexes[tmp_lower];
				if (lastpos[tmp_trgindex] == tmp_lower)
					lastpos[tmp_trgindex] = -1;
			}
		}
	}

	pfree(lastpos);

	return smlr_max;
}
/*
 * Calculate word similarity.
 * This function prepares two arrays: "trg2indexes" and "found". Then these
 * arrays are used to calculate word similarity using
 * iterate_word_similarity().
 *
 * "trg2indexes" is array which stores indexes of the array "found".
 * In other words:
 * trg2indexes[j] = i;
 * found[i] = true (or false);
 * If found[i] == true then there is trigram trg2[j] in array "trg1".
 * If found[i] == false then there is not trigram trg2[j] in array "trg1".
 *
 * str1: search pattern string, of length slen1 bytes.
 * str2: text in which we are looking for a word, of length slen2 bytes.
 * check_only: if true then only check existence of similar search pattern in
 *		text.
 *
 * Returns word similarity; 0 if either string produces no trigrams.
 * Raises an error (via protect_out_of_mem) if the inputs are too long.
 */
static float4
calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
					 bool check_only)
{
	bool	   *found;
	pos_trgm   *ptrg;
	trgm	   *trg1;
	trgm	   *trg2;
	int			len1,
				len2,
				len,
				i,
				j,
				ulen1;
	int		   *trg2indexes;
	float4		result;

	protect_out_of_mem(slen1 + slen2);

	/* Make positional trigrams */
	trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
	trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);

	len1 = generate_trgm_only(trg1, str1, slen1);
	len2 = generate_trgm_only(trg2, str2, slen2);

	/*
	 * If either side has no trigrams there is nothing to match and the
	 * similarity is 0.  Returning here also keeps the code below from
	 * reading found[0] of a zero-length array when both sides are empty.
	 */
	if (len1 == 0 || len2 == 0)
	{
		pfree(trg1);
		pfree(trg2);
		return 0.0f;
	}

	ptrg = make_positional_trgm(trg1, len1, trg2, len2);
	len = len1 + len2;
	qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);

	pfree(trg1);
	pfree(trg2);

	/*
	 * Merge positional trigrams array: enumerate each trigram and find its
	 * presence in required word.  After sorting, equal trigrams are adjacent
	 * and pattern entries (index -1) precede text entries, so j numbers the
	 * distinct trigrams in sorted order.
	 */
	trg2indexes = (int *) palloc(sizeof(int) * len2);
	found = (bool *) palloc0(sizeof(bool) * len);

	ulen1 = 0;
	j = 0;
	for (i = 0; i < len; i++)
	{
		if (i > 0)
		{
			int			cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);

			if (cmp != 0)
			{
				/* Finished a distinct trigram; count it if it is in trg1 */
				if (found[j])
					ulen1++;
				j++;
			}
		}

		if (ptrg[i].index >= 0)
		{
			/* Text trigram: record which distinct trigram sits there */
			trg2indexes[ptrg[i].index] = j;
		}
		else
		{
			/* Pattern trigram */
			found[j] = true;
		}
	}
	/* Account for the last distinct trigram */
	if (found[j])
		ulen1++;

	/* Run iterative procedure to find maximum similarity with word */
	result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
									 check_only);

	pfree(trg2indexes);
	pfree(found);
	pfree(ptrg);

	return result;
}
/* /*
* Extract the next non-wildcard part of a search string, ie, a word bounded * Extract the next non-wildcard part of a search string, ie, a word bounded
* by '_' or '%' meta-characters, non-word characters or string end. * by '_' or '%' meta-characters, non-word characters or string end.
...@@ -459,17 +793,7 @@ generate_wildcard_trgm(const char *str, int slen) ...@@ -459,17 +793,7 @@ generate_wildcard_trgm(const char *str, int slen)
bytelen; bytelen;
const char *eword; const char *eword;
/* protect_out_of_mem(slen);
* Guard against possible overflow in the palloc requests below. (We
* don't worry about the additive constants, since palloc can detect
* requests that are a little above MaxAllocSize --- we just need to
* prevent integer overflow in the multiplications.)
*/
if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
(Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("out of memory")));
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3); trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
trg->flag = ARRKEY; trg->flag = ARRKEY;
...@@ -590,7 +914,7 @@ show_trgm(PG_FUNCTION_ARGS) ...@@ -590,7 +914,7 @@ show_trgm(PG_FUNCTION_ARGS)
} }
float4 float4
cnt_sml(TRGM *trg1, TRGM *trg2) cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
{ {
trgm *ptr1, trgm *ptr1,
*ptr2; *ptr2;
...@@ -624,14 +948,15 @@ cnt_sml(TRGM *trg1, TRGM *trg2) ...@@ -624,14 +948,15 @@ cnt_sml(TRGM *trg1, TRGM *trg2)
} }
} }
#ifdef DIVUNION /*
return ((float4) count) / ((float4) (len1 + len2 - count)); * If inexact then len2 is equal to count, because we don't know actual
#else * length of second string in inexact search and we can assume that count
return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2)); * is a lower bound of len2.
#endif */
return CALCSML(count, len1, inexact ? count : len2);
} }
/* /*
* Returns whether trg2 contains all trigrams in trg1. * Returns whether trg2 contains all trigrams in trg1.
* This relies on the trigram arrays being sorted. * This relies on the trigram arrays being sorted.
...@@ -726,7 +1051,7 @@ similarity(PG_FUNCTION_ARGS) ...@@ -726,7 +1051,7 @@ similarity(PG_FUNCTION_ARGS)
trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ); trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ); trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
res = cnt_sml(trg1, trg2); res = cnt_sml(trg1, trg2, false);
pfree(trg1); pfree(trg1);
pfree(trg2); pfree(trg2);
...@@ -736,6 +1061,22 @@ similarity(PG_FUNCTION_ARGS) ...@@ -736,6 +1061,22 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4(res); PG_RETURN_FLOAT4(res);
} }
/*
 * word_similarity
 *
 * SQL-callable entry point.  Passes the contents of the two text arguments,
 * in call order, to calc_word_similarity() with the trailing boolean set to
 * false, and returns the float4 value it produces unchanged.
 */
Datum
word_similarity(PG_FUNCTION_ARGS)
{
	text	   *first = PG_GETARG_TEXT_PP(0);
	text	   *second = PG_GETARG_TEXT_PP(1);
	float4		score = calc_word_similarity(VARDATA_ANY(first),
											 VARSIZE_ANY_EXHDR(first),
											 VARDATA_ANY(second),
											 VARSIZE_ANY_EXHDR(second),
											 false);

	/* Free the argument values if fetching them produced copies. */
	PG_FREE_IF_COPY(first, 0);
	PG_FREE_IF_COPY(second, 1);

	PG_RETURN_FLOAT4(score);
}
Datum Datum
similarity_dist(PG_FUNCTION_ARGS) similarity_dist(PG_FUNCTION_ARGS)
{ {
...@@ -755,3 +1096,67 @@ similarity_op(PG_FUNCTION_ARGS) ...@@ -755,3 +1096,67 @@ similarity_op(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(res >= similarity_threshold); PG_RETURN_BOOL(res >= similarity_threshold);
} }
/*
 * word_similarity_op
 *
 * Boolean operator support function.  Runs calc_word_similarity() on the two
 * text arguments in call order, with the trailing boolean set to true, and
 * reports whether the result is at least word_similarity_threshold.
 */
Datum
word_similarity_op(PG_FUNCTION_ARGS)
{
	text	   *first = PG_GETARG_TEXT_PP(0);
	text	   *second = PG_GETARG_TEXT_PP(1);
	float4		score = calc_word_similarity(VARDATA_ANY(first),
											 VARSIZE_ANY_EXHDR(first),
											 VARDATA_ANY(second),
											 VARSIZE_ANY_EXHDR(second),
											 true);

	/* Free the argument values if fetching them produced copies. */
	PG_FREE_IF_COPY(first, 0);
	PG_FREE_IF_COPY(second, 1);

	PG_RETURN_BOOL(score >= word_similarity_threshold);
}
/*
 * word_similarity_commutator_op
 *
 * Same test as word_similarity_op, but with the operands exchanged: the
 * second SQL argument is handed to calc_word_similarity() first and the
 * first SQL argument second.  The trailing boolean is true, and the result
 * is compared against word_similarity_threshold.
 */
Datum
word_similarity_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *left = PG_GETARG_TEXT_PP(0);
	text	   *right = PG_GETARG_TEXT_PP(1);
	float4		score = calc_word_similarity(VARDATA_ANY(right),
											 VARSIZE_ANY_EXHDR(right),
											 VARDATA_ANY(left),
											 VARSIZE_ANY_EXHDR(left),
											 true);

	/* Free the argument values if fetching them produced copies. */
	PG_FREE_IF_COPY(left, 0);
	PG_FREE_IF_COPY(right, 1);

	PG_RETURN_BOOL(score >= word_similarity_threshold);
}
/*
 * word_similarity_dist_op
 *
 * Distance operator support function.  Runs calc_word_similarity() on the
 * two text arguments in call order, with the trailing boolean set to false,
 * and returns one minus that value as a float4.
 */
Datum
word_similarity_dist_op(PG_FUNCTION_ARGS)
{
	text	   *first = PG_GETARG_TEXT_PP(0);
	text	   *second = PG_GETARG_TEXT_PP(1);
	float4		score = calc_word_similarity(VARDATA_ANY(first),
											 VARSIZE_ANY_EXHDR(first),
											 VARDATA_ANY(second),
											 VARSIZE_ANY_EXHDR(second),
											 false);

	/* Free the argument values if fetching them produced copies. */
	PG_FREE_IF_COPY(first, 0);
	PG_FREE_IF_COPY(second, 1);

	PG_RETURN_FLOAT4(1.0 - score);
}
/*
 * word_similarity_dist_commutator_op
 *
 * Same computation as word_similarity_dist_op, but with the operands
 * exchanged: the second SQL argument is handed to calc_word_similarity()
 * first and the first SQL argument second.  The trailing boolean is false,
 * and one minus the result is returned as a float4.
 */
Datum
word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
{
	text	   *left = PG_GETARG_TEXT_PP(0);
	text	   *right = PG_GETARG_TEXT_PP(1);
	float4		score = calc_word_similarity(VARDATA_ANY(right),
											 VARSIZE_ANY_EXHDR(right),
											 VARDATA_ANY(left),
											 VARSIZE_ANY_EXHDR(left),
											 false);

	/* Free the argument values if fetching them produced copies. */
	PG_FREE_IF_COPY(left, 0);
	PG_FREE_IF_COPY(right, 1);

	PG_RETURN_FLOAT4(1.0 - score);
}
...@@ -92,6 +92,21 @@ ...@@ -92,6 +92,21 @@
(In practice this is seldom useful except for debugging.) (In practice this is seldom useful except for debugging.)
</entry> </entry>
</row> </row>
<row>
<entry>
<function>word_similarity(text, text)</function>
<indexterm><primary>word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Returns a number that indicates how similar the first string is
to the most similar word of the second string. The function searches
the second string for the most similar word, not the most similar
substring. The range of the result is zero (indicating that the two
strings are completely dissimilar) to one (indicating that the first
string is identical to one of the words of the second string).
</entry>
</row>
<row> <row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry> <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
...@@ -137,6 +152,16 @@ ...@@ -137,6 +152,16 @@
<varname>pg_trgm.similarity_threshold</>. <varname>pg_trgm.similarity_threshold</>.
</entry> </entry>
</row> </row>
<row>
<entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</> if its first argument has a similar word in
the second argument, that is, if their word similarity is greater than
or equal to the current word similarity threshold set by the
<varname>pg_trgm.word_similarity_threshold</> parameter.
</entry>
</row>
<row> <row>
<entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry> <entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
...@@ -145,6 +170,16 @@ ...@@ -145,6 +170,16 @@
one minus the <function>similarity()</> value. one minus the <function>similarity()</> value.
</entry> </entry>
</row> </row>
<row>
<entry>
<type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</> between the arguments, that is
one minus the <function>word_similarity()</> value.
</entry>
</row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
...@@ -168,6 +203,23 @@ ...@@ -168,6 +203,23 @@
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
<term>
<varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
<indexterm>
<primary>
<varname>pg_trgm.word_similarity_threshold</> configuration parameter
</primary>
</indexterm>
</term>
<listitem>
<para>
Sets the current word similarity threshold that is used by
the <literal>%&gt;</> operator. The threshold must be between
0 and 1 (default is 0.6).
</para>
</listitem>
</varlistentry>
</variablelist> </variablelist>
</sect2> </sect2>
...@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist ...@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
a small number of the closest matches is wanted. a small number of the closest matches is wanted.
</para> </para>
<para>
You can also use an index on the <structfield>t</> column for word
similarity searches. For example:
<programlisting>
SELECT t, word_similarity('<replaceable>word</>', t) AS sml
FROM test_trgm
WHERE t %&gt; '<replaceable>word</>'
ORDER BY sml DESC, t;
</programlisting>
This will return all values in the text column that contain a word
that is sufficiently similar to <replaceable>word</>, sorted from best
match to worst. The index will be used to make this a fast operation
even over very large data sets.
</para>
<para>
A variant of the above query is
<programlisting>
SELECT t, t &lt;-&gt;&gt; '<replaceable>word</>' AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes.
</para>
<para> <para>
Beginning in <productname>PostgreSQL</> 9.1, these index types also support Beginning in <productname>PostgreSQL</> 9.1, these index types also support
index searches for <literal>LIKE</> and <literal>ILIKE</>, for example index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment