Commit f576b17c authored by Teodor Sigaev's avatar Teodor Sigaev

Add word_similarity to pg_trgm contrib module.

This patch introduces the concept of similarity between a string and the most
similar word of another string.

The extension version is not changed because 1.2 was already introduced during
the 9.6 release cycle, so no public release has shipped with it yet.

Author: Alexander Korotkov, Artur Zakirov
parent 1c4f001b
......@@ -7,7 +7,7 @@ EXTENSION = pg_trgm
DATA = pg_trgm--1.2.sql pg_trgm--1.0--1.1.sql pg_trgm--1.1--1.2.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm
REGRESS = pg_trgm pg_word_trgm
ifdef USE_PGXS
PG_CONFIG = pg_config
......
......@@ -59,7 +59,7 @@ select similarity('---', '####---');
0
(1 row)
CREATE TABLE test_trgm(t text);
CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data'
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
t | sml
......@@ -3467,7 +3467,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
create table test2(t text);
create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
......
......@@ -3,10 +3,72 @@
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.2'" to load this file. \quit
-- word_similarity(text, text): returns a float4 indicating how similar the
-- first string is to the most similar word of the second string
-- (0 = completely dissimilar, 1 = identical to one of the words).
CREATE FUNCTION word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Boolean function used as the PROCEDURE of the <% operator in this script.
CREATE FUNCTION word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Boolean function used as the PROCEDURE of the %> operator in this script
-- (the commutator of <%).
CREATE FUNCTION word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- float4 function used as the PROCEDURE of the <<-> operator in this script.
-- NOTE(review): presumably one minus word_similarity() of its arguments,
-- mirroring the documented <->> operator; confirm against the C source.
CREATE FUNCTION word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- float4 function used as the PROCEDURE of the <->> operator in this script
-- (the commutator of <<->).
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Word-similarity match operators on (text, text).
-- <% and %> are declared as commutators of each other, so "a <% b" can be
-- rewritten as "b %> a". Both use the generic contsel / contjoinsel
-- selectivity estimators.
CREATE OPERATOR <% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_op,
COMMUTATOR = '%>',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OPERATOR %> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_commutator_op,
COMMUTATOR = '<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
-- Word-similarity distance operators on (text, text).
-- <<-> and <->> are declared as commutators of each other. They return a
-- float4 rather than a boolean, so no RESTRICT/JOIN estimators are given.
CREATE OPERATOR <<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_op,
COMMUTATOR = '<->>'
);
CREATE OPERATOR <->> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_commutator_op,
COMMUTATOR = '<<->'
);
-- C-language function returning "char"; this script adds it to the
-- gin_trgm_ops operator family as GIN support FUNCTION 6.
CREATE FUNCTION gin_trgm_triconsistent(internal, int2, text, int4, internal, internal, internal)
RETURNS "char"
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Register the word-similarity operators with the GiST operator family:
-- strategy 7 is the %> search operator, strategy 8 is the <->> ordering
-- (distance) operator, whose float4 result is sorted using float_ops.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 7 %> (text, text),
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
OPERATOR 7 %> (text, text),
FUNCTION 6 (text, text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
......@@ -39,6 +39,39 @@ CREATE OPERATOR % (
JOIN = contjoinsel
);
-- word_similarity(text, text): returns a float4 indicating how similar the
-- first string is to the most similar word of the second string
-- (0 = completely dissimilar, 1 = identical to one of the words).
CREATE FUNCTION word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Boolean function used as the PROCEDURE of the <% operator below.
CREATE FUNCTION word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Boolean function used as the PROCEDURE of the %> operator below
-- (the commutator of <%).
CREATE FUNCTION word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE; -- stable because depends on pg_trgm.word_similarity_threshold
-- Word-similarity match operators on (text, text). <% and %> are declared
-- as commutators of each other and use the generic contsel / contjoinsel
-- selectivity estimators.
CREATE OPERATOR <% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_op,
COMMUTATOR = '%>',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OPERATOR %> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_commutator_op,
COMMUTATOR = '<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE FUNCTION similarity_dist(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
......@@ -51,6 +84,30 @@ CREATE OPERATOR <-> (
COMMUTATOR = '<->'
);
-- float4 function used as the PROCEDURE of the <<-> operator below.
-- NOTE(review): presumably one minus word_similarity() of its arguments,
-- mirroring the documented <->> operator; confirm against the C source.
CREATE FUNCTION word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- float4 function used as the PROCEDURE of the <->> operator below
-- (the commutator of <<->).
CREATE FUNCTION word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Word-similarity distance operators on (text, text). <<-> and <->> are
-- declared as commutators of each other; no RESTRICT/JOIN estimators are
-- given because they return a float4 rather than a boolean.
CREATE OPERATOR <<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_op,
COMMUTATOR = '<->>'
);
CREATE OPERATOR <->> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = word_similarity_dist_commutator_op,
COMMUTATOR = '<<->'
);
-- gist key
CREATE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm
......@@ -140,6 +197,12 @@ ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 5 pg_catalog.~ (text, text),
OPERATOR 6 pg_catalog.~* (text, text);
-- Add operators that are new in 9.6 (pg_trgm 1.2).
-- Strategy 7 is the %> word-similarity search operator; strategy 8 is the
-- <->> word-similarity distance operator, used for ORDER BY and sorted
-- with float_ops.
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 7 %> (text, text),
OPERATOR 8 <->> (text, text) FOR ORDER BY pg_catalog.float_ops;
-- support functions for gin
CREATE FUNCTION gin_extract_value_trgm(text, internal)
RETURNS internal
......@@ -187,4 +250,5 @@ AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
-- Add the %> word-similarity search operator (strategy 7) and the
-- gin_trgm_triconsistent support function (GIN support function 6) to the
-- GIN operator family. No ordering (distance) operator is added here.
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 7 %> (text, text),
FUNCTION 6 (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
......@@ -13,7 +13,7 @@ select similarity('wow',' WOW ');
select similarity('---', '####---');
CREATE TABLE test_trgm(t text);
CREATE TABLE test_trgm(t text COLLATE "C");
\copy test_trgm from 'data/trgm.data'
......@@ -40,7 +40,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
create table test2(t text);
create table test2(t text COLLATE "C");
insert into test2 values ('abcdef');
insert into test2 values ('quark');
insert into test2 values (' z foo bar');
......
......@@ -26,13 +26,14 @@
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
#define LikeStrategyNumber 3
#define ILikeStrategyNumber 4
#define RegExpStrategyNumber 5
#define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8
typedef char trgm[3];
......@@ -103,15 +104,28 @@ typedef char *BITVECP;
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
/*
* If DIVUNION is defined then similarity formula is:
* count / (len1 + len2 - count)
* else if DIVUNION is not defined then similarity formula is:
* count / max(len1, len2)
*/
#ifdef DIVUNION
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) ((len1) + (len2) - (count)))
#else
#define CALCSML(count, len1, len2) ((float4) (count)) / ((float4) (((len1) > (len2)) ? (len1) : (len2)))
#endif
typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold;
extern double word_similarity_threshold;
extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen);
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact);
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
extern bool *trgm_presence_map(TRGM *query, TRGM *key);
extern TRGM *createTrgmNFA(text *text_re, Oid collation,
......
......@@ -89,6 +89,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
break;
case ILikeStrategyNumber:
......@@ -176,6 +177,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
bool res;
int32 i,
ntrue;
double nlimit;
/* All cases served by this function are inexact */
*recheck = true;
......@@ -183,6 +185,10 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */
ntrue = 0;
for (i = 0; i < nkeys; i++)
......@@ -207,8 +213,7 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
* So, independently of DIVUNION the upper bound formula is the same.
*/
res = (nkeys == 0) ? false :
((((((float4) ntrue) / ((float4) nkeys))) >= similarity_threshold)
? true : false);
(((((float4) ntrue) / ((float4) nkeys))) >= nlimit);
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
......@@ -270,10 +275,15 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
int32 i,
ntrue;
bool *boolcheck;
double nlimit;
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
/* Count the matches */
ntrue = 0;
for (i = 0; i < nkeys; i++)
......@@ -285,9 +295,9 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
/*
* See comment in gin_trgm_consistent() about the upper bound formula
*/
res = (nkeys == 0) ? GIN_FALSE :
(((((float4) ntrue) / ((float4) nkeys)) >= similarity_threshold)
? GIN_MAYBE : GIN_FALSE);
res = (nkeys == 0)
? GIN_FALSE : (((((float4) ntrue) / ((float4) nkeys)) >= nlimit)
? GIN_MAYBE : GIN_FALSE);
break;
case ILikeStrategyNumber:
#ifndef IGNORECASE
......
......@@ -191,6 +191,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
bool res;
Size querysize = VARSIZE(query);
gtrgm_consistent_cache *cache;
double nlimit;
/*
* We keep the extracted trigrams in cache, because trigram extraction is
......@@ -218,6 +219,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ);
break;
......@@ -286,16 +288,23 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
switch (strategy)
{
case SimilarityStrategyNumber:
/* Similarity search is exact */
*recheck = false;
case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */
*recheck = (strategy == WordSimilarityStrategyNumber);
nlimit = (strategy == SimilarityStrategyNumber) ?
similarity_threshold : word_similarity_threshold;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the tmpsml variable using volatile
* keyword. Otherwise comparison of nlimit and tmpsml may give
* wrong results.
*/
float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res = (*(int *) &tmpsml == *(int *) &similarity_threshold
|| tmpsml > similarity_threshold) ? true : false;
res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
}
else if (ISALLTRUE(key))
{ /* non-leaf contains signature */
......@@ -309,8 +318,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
if (len == 0)
res = false;
else
res = (((((float8) count) / ((float8) len))) >= similarity_threshold)
? true : false;
res = (((((float8) count) / ((float8) len))) >= nlimit);
}
break;
case ILikeStrategyNumber:
......@@ -428,6 +436,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
/* Oid subtype = PG_GETARG_OID(3); */
bool *recheck = (bool *) PG_GETARG_POINTER(4);
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
float8 res;
......@@ -463,9 +472,17 @@ gtrgm_distance(PG_FUNCTION_ARGS)
switch (strategy)
{
case DistanceStrategyNumber:
case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
res = 1.0 - cnt_sml(key, qtrg);
/*
* Prevent gcc optimizing the sml variable using volatile
* keyword. Otherwise res can differ from the
* word_similarity_dist_op() function.
*/
float4 volatile sml = cnt_sml(qtrg, key, *recheck);
res = 1.0 - sml;
}
else if (ISALLTRUE(key))
{ /* all leafs contains orig trgm */
......
This diff is collapsed.
......@@ -92,6 +92,21 @@
(In practice this is seldom useful except for debugging.)
</entry>
</row>
<row>
<entry>
<function>word_similarity(text, text)</function>
<indexterm><primary>word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Returns a number that indicates how similar the first string is
to the most similar word of the second string. The function searches the
second string for the most similar word, not the most similar substring. The
range of the result is zero (indicating that the two strings are
completely dissimilar) to one (indicating that the first string is
identical to one of the words of the second string).
</entry>
</row>
<row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry>
......@@ -137,6 +152,16 @@
<varname>pg_trgm.similarity_threshold</>.
</entry>
</row>
<row>
<entry><type>text</> <literal>%&gt;</literal> <type>text</></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</> if its first argument contains a word that is
similar to the second argument, with a similarity that is greater than the
current word similarity threshold set by the
<varname>pg_trgm.word_similarity_threshold</> parameter.
</entry>
</row>
<row>
<entry><type>text</> <literal>&lt;-&gt;</literal> <type>text</></entry>
<entry><type>real</type></entry>
......@@ -145,6 +170,16 @@
one minus the <function>similarity()</> value.
</entry>
</row>
<row>
<entry>
<type>text</> <literal>&lt;-&gt;&gt;</literal> <type>text</>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</> between the arguments, that is
one minus the <function>word_similarity()</> value.
</entry>
</row>
</tbody>
</tgroup>
</table>
......@@ -168,6 +203,23 @@
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-pgtrgm-word-similarity-threshold" xreflabel="pg_trgm.word_similarity_threshold">
<term>
<varname>pg_trgm.word_similarity_threshold</> (<type>real</type>)
<indexterm>
<primary>
<varname>pg_trgm.word_similarity_threshold</> configuration parameter
</primary>
</indexterm>
</term>
<listitem>
<para>
Sets the current word similarity threshold that is used by
the <literal>%&gt;</> operator. The threshold must be between
0 and 1 (default is 0.6).
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
......@@ -225,6 +277,33 @@ SELECT t, t &lt;-&gt; '<replaceable>word</>' AS dist
a small number of the closest matches is wanted.
</para>
<para>
You can also use an index on the <structfield>t</> column for word
similarity. For example:
<programlisting>
SELECT t, word_similarity('<replaceable>word</>', t) AS sml
FROM test_trgm
WHERE t %&gt; '<replaceable>word</>'
ORDER BY sml DESC, t;
</programlisting>
This will return all values in the text column that have a word
that is sufficiently similar to <replaceable>word</>, sorted from best
match to worst. The index will be used to make this a fast operation
even over very large data sets.
</para>
<para>
A variant of the above query is
<programlisting>
SELECT t, t &lt;-&gt;&gt; '<replaceable>word</>' AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes.
</para>
<para>
Beginning in <productname>PostgreSQL</> 9.1, these index types also support
index searches for <literal>LIKE</> and <literal>ILIKE</>, for example
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment