Commit be8a7a68 authored by Teodor Sigaev's avatar Teodor Sigaev

Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but
it takes into account word boundaries to compute similarity.

Author: Alexander Korotkov
Review by: David Steele, Liudmila Mantrova, me
Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
parent f20b3285
...@@ -4,11 +4,12 @@ MODULE_big = pg_trgm ...@@ -4,11 +4,12 @@ MODULE_big = pg_trgm
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES) OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
EXTENSION = pg_trgm EXTENSION = pg_trgm
DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \ DATA = pg_trgm--1.3--1.4.sql \
pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
PGFILEDESC = "pg_trgm - trigram matching" PGFILEDESC = "pg_trgm - trigram matching"
REGRESS = pg_trgm pg_word_trgm REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
ifdef USE_PGXS ifdef USE_PGXS
PG_CONFIG = pg_config PG_CONFIG = pg_config
......
This diff is collapsed.
/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
CREATE FUNCTION strict_word_similarity(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE FUNCTION strict_word_similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.word_similarity_threshold
CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE; -- stable because depends on pg_trgm.word_similarity_threshold
CREATE OPERATOR <<% (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_op,
COMMUTATOR = '%>>',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OPERATOR %>> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_commutator_op,
COMMUTATOR = '<<%',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE FUNCTION strict_word_similarity_dist_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
CREATE OPERATOR <<<-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_dist_op,
COMMUTATOR = '<->>>'
);
CREATE OPERATOR <->>> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = strict_word_similarity_dist_commutator_op,
COMMUTATOR = '<<<->'
);
ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
OPERATOR 9 %>> (text, text),
OPERATOR 10 <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
OPERATOR 9 %>> (text, text);
# pg_trgm extension # pg_trgm extension
comment = 'text similarity measurement and index searching based on trigrams' comment = 'text similarity measurement and index searching based on trigrams'
default_version = '1.3' default_version = '1.4'
module_pathname = '$libdir/pg_trgm' module_pathname = '$libdir/pg_trgm'
relocatable = true relocatable = true
DROP INDEX trgm_idx2;
\copy test_trgm3 from 'data/trgm2.data'
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
explain (costs off)
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
drop index trgm_idx2;
create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
set enable_seqscan=off;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.4;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
set "pg_trgm.strict_word_similarity_threshold" to 0.2;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "access/gist.h" #include "access/gist.h"
#include "access/itup.h" #include "access/itup.h"
#include "access/stratnum.h"
#include "storage/bufpage.h" #include "storage/bufpage.h"
/* /*
...@@ -34,6 +35,8 @@ ...@@ -34,6 +35,8 @@
#define RegExpICaseStrategyNumber 6 #define RegExpICaseStrategyNumber 6
#define WordSimilarityStrategyNumber 7 #define WordSimilarityStrategyNumber 7
#define WordDistanceStrategyNumber 8 #define WordDistanceStrategyNumber 8
#define StrictWordSimilarityStrategyNumber 9
#define StrictWordDistanceStrategyNumber 10
typedef char trgm[3]; typedef char trgm[3];
...@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph; ...@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
extern double similarity_threshold; extern double similarity_threshold;
extern double word_similarity_threshold; extern double word_similarity_threshold;
extern double strict_word_similarity_threshold;
extern double index_strategy_get_limit(StrategyNumber strategy);
extern uint32 trgm2int(trgm *ptr); extern uint32 trgm2int(trgm *ptr);
extern void compact_trigram(trgm *tptr, char *str, int bytelen); extern void compact_trigram(trgm *tptr, char *str, int bytelen);
extern TRGM *generate_trgm(char *str, int slen); extern TRGM *generate_trgm(char *str, int slen);
......
...@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS) ...@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
break; break;
case ILikeStrategyNumber: case ILikeStrategyNumber:
...@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS) ...@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ? case StrictWordSimilarityStrategyNumber:
similarity_threshold : word_similarity_threshold; nlimit = index_strategy_get_limit(strategy);
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;
...@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS) ...@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
nlimit = (strategy == SimilarityStrategyNumber) ? case StrictWordSimilarityStrategyNumber:
similarity_threshold : word_similarity_threshold; nlimit = index_strategy_get_limit(strategy);
/* Count the matches */ /* Count the matches */
ntrue = 0; ntrue = 0;
......
...@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
case StrictWordSimilarityStrategyNumber:
qtrg = generate_trgm(VARDATA(query), qtrg = generate_trgm(VARDATA(query),
querysize - VARHDRSZ); querysize - VARHDRSZ);
break; break;
...@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS) ...@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
{ {
case SimilarityStrategyNumber: case SimilarityStrategyNumber:
case WordSimilarityStrategyNumber: case WordSimilarityStrategyNumber:
/* Similarity search is exact. Word similarity search is inexact */ case StrictWordSimilarityStrategyNumber:
*recheck = (strategy == WordSimilarityStrategyNumber); /* Similarity search is exact. (Strict) word similarity search is inexact */
nlimit = (strategy == SimilarityStrategyNumber) ? *recheck = (strategy != SimilarityStrategyNumber);
similarity_threshold : word_similarity_threshold;
nlimit = index_strategy_get_limit(strategy);
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
...@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS) ...@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
{ {
case DistanceStrategyNumber: case DistanceStrategyNumber:
case WordDistanceStrategyNumber: case WordDistanceStrategyNumber:
*recheck = strategy == WordDistanceStrategyNumber; case StrictWordDistanceStrategyNumber:
/* Only plain trigram distance is exact */
*recheck = (strategy != DistanceStrategyNumber);
if (GIST_LEAF(entry)) if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */ { /* all leafs contains orig trgm */
......
This diff is collapsed.
...@@ -105,6 +105,17 @@ ...@@ -105,6 +105,17 @@
the explanation below. the explanation below.
</entry> </entry>
</row> </row>
<row>
<entry>
<function>strict_word_similarity(text, text)</function>
<indexterm><primary>strict_word_similarity</primary></indexterm>
</entry>
<entry><type>real</type></entry>
<entry>
Same as <function>word_similarity(text, text)</function>, but forces
extent boundaries to match word boundaries.
</entry>
</row>
<row> <row>
<entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry> <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
...@@ -157,6 +168,29 @@ ...@@ -157,6 +168,29 @@
a part of the word. a part of the word.
</para> </para>
<para>
At the same time, <function>strict_word_similarity(text, text)</function>
has to select an extent that matches word boundaries. In the example above,
<function>strict_word_similarity(text, text)</function> would select the
extent <literal>{" w"," wo","wor","ord","rds", ds "}</literal>, which
corresponds to the whole word <literal>'words'</literal>.
<programlisting>
# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
strict_word_similarity | similarity
------------------------+------------
0.571429 | 0.571429
(1 row)
</programlisting>
</para>
<para>
Thus, the <function>strict_word_similarity(text, text)</function> function
is useful for finding similar subsets of whole words, while
<function>word_similarity(text, text)</function> is more suitable for
searching similar parts of words.
</para>
<table id="pgtrgm-op-table"> <table id="pgtrgm-op-table">
<title><filename>pg_trgm</filename> Operators</title> <title><filename>pg_trgm</filename> Operators</title>
<tgroup cols="3"> <tgroup cols="3">
...@@ -196,6 +230,24 @@ ...@@ -196,6 +230,24 @@
Commutator of the <literal>&lt;%</literal> operator. Commutator of the <literal>&lt;%</literal> operator.
</entry> </entry>
</row> </row>
<row>
<entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Returns <literal>true</literal> if its second argument has a continuous
extent of an ordered trigram set that matches word boundaries,
and its similarity to the trigram set of the first argument is greater
than the current strict word similarity threshold set by the
<varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
</entry>
</row>
<row>
<entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
<entry><type>boolean</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;%</literal> operator.
</entry>
</row>
<row> <row>
<entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry> <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
<entry><type>real</type></entry> <entry><type>real</type></entry>
...@@ -223,6 +275,25 @@ ...@@ -223,6 +275,25 @@
Commutator of the <literal>&lt;&lt;-&gt;</literal> operator. Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
</entry> </entry>
</row> </row>
<row>
<entry>
<type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</quote> between the arguments, that is
one minus the <function>strict_word_similarity()</function> value.
</entry>
</row>
<row>
<entry>
<type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
</entry>
<entry><type>real</type></entry>
<entry>
Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
</entry>
</row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
...@@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist ...@@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
<para> <para>
Also you can use an index on the <structfield>t</structfield> column for word Also you can use an index on the <structfield>t</structfield> column for word
similarity. For example: similarity or strict word similarity. Typical queries are:
<programlisting> <programlisting>
SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;% t WHERE '<replaceable>word</replaceable>' &lt;% t
ORDER BY sml DESC, t; ORDER BY sml DESC, t;
</programlisting>
and
<programlisting>
SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
FROM test_trgm
WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
ORDER BY sml DESC, t;
</programlisting> </programlisting>
This will return all values in the text column for which there is a This will return all values in the text column for which there is a
continuous extent in the corresponding ordered trigram set that is continuous extent in the corresponding ordered trigram set that is
...@@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml ...@@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
</para> </para>
<para> <para>
A variant of the above query is Possible variants of the above queries are:
<programlisting> <programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
FROM test_trgm FROM test_trgm
ORDER BY dist LIMIT 10; ORDER BY dist LIMIT 10;
</programlisting>
and
<programlisting>
SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting> </programlisting>
This can be implemented quite efficiently by GiST indexes, but not This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes. by GIN indexes.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment