Add strict_word_similarity to pg_trgm module

strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com

Add strict_word_similarity to pg_trgm module
strict_word_similarity is similar to existing word_similarity function but it takes into account word boundaries to compute similarity. Author: Alexander Korotkov Review by: David Steele, Liudmila Mantrova, me Discussion: https://www.postgresql.org/message-id/flat/CY4PR17MB13207ED8310F847CF117EED0D85A0@CY4PR17MB1320.namprd17.prod.outlook.com
be8a7a68 · Teodor Sigaev · f20b3285 · be8a7a68 · be8a7a68 · be8a7a68
Commit be8a7a68 authored Mar 21, 2018 by Teodor Sigaev
10 changed files
--- a/contrib/pg_trgm/Makefile
+++ b/contrib/pg_trgm/Makefile
@@ -4,11 +4,12 @@ MODULE_big = pg_trgm
 OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o $(WIN32RES)
 EXTENSION = pg_trgm
-DATA = pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
+DATA = pg_trgm--1.3--1.4.sql \
+	pg_trgm--1.3.sql pg_trgm--1.2--1.3.sql pg_trgm--1.1--1.2.sql \
 	pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
 PGFILEDESC = "pg_trgm - trigram matching"
-REGRESS = pg_trgm pg_word_trgm
+REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm
 ifdef USE_PGXS
 PG_CONFIG = pg_config

--- a/contrib/pg_trgm/expected/pg_strict_word_trgm.out
+++ b/contrib/pg_trgm/expected/pg_strict_word_trgm.out
--- a/contrib/pg_trgm/pg_trgm--1.3--1.4.sql
+++ b/contrib/pg_trgm/pg_trgm--1.3--1.4.sql
+/* contrib/pg_trgm/pg_trgm--1.3--1.4.sql */
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.4'" to load this file. \quit
+CREATE FUNCTION strict_word_similarity(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+CREATE FUNCTION strict_word_similarity_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.word_similarity_threshold
+CREATE FUNCTION strict_word_similarity_commutator_op(text,text)
+RETURNS bool
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT STABLE PARALLEL SAFE;  -- stable because depends on pg_trgm.word_similarity_threshold
+CREATE OPERATOR <<% (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_op,
+        COMMUTATOR = '%>>',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+CREATE OPERATOR %>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_commutator_op,
+        COMMUTATOR = '<<%',
+        RESTRICT = contsel,
+        JOIN = contjoinsel
+);
+CREATE FUNCTION strict_word_similarity_dist_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+CREATE FUNCTION strict_word_similarity_dist_commutator_op(text,text)
+RETURNS float4
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;
+CREATE OPERATOR <<<-> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_op,
+        COMMUTATOR = '<->>>'
+);
+CREATE OPERATOR <->>> (
+        LEFTARG = text,
+        RIGHTARG = text,
+        PROCEDURE = strict_word_similarity_dist_commutator_op,
+        COMMUTATOR = '<<<->'
+);
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        9       %>> (text, text),
+        OPERATOR        10       <->>> (text, text) FOR ORDER BY pg_catalog.float_ops;
+ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
+        OPERATOR        9       %>> (text, text);
--- a/contrib/pg_trgm/pg_trgm.control
+++ b/contrib/pg_trgm/pg_trgm.control
 # pg_trgm extension
 comment = 'text similarity measurement and index searching based on trigrams'
-default_version = '1.3'
+default_version = '1.4'
 module_pathname = '$libdir/pg_trgm'
 relocatable = true
--- a/contrib/pg_trgm/sql/pg_strict_word_trgm.sql
+++ b/contrib/pg_trgm/sql/pg_strict_word_trgm.sql
+DROP INDEX trgm_idx2;
+\copy test_trgm3 from 'data/trgm2.data'
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+create index trgm_idx2 on test_trgm2 using gist (t gist_trgm_ops);
+set enable_seqscan=off;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+explain (costs off)
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+select t <->>> 'Alaikallupoddakulam', t from test_trgm2 order by t <->>> 'Alaikallupoddakulam' limit 7;
+drop index trgm_idx2;
+create index trgm_idx2 on test_trgm2 using gin (t gin_trgm_ops);
+set enable_seqscan=off;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+set "pg_trgm.strict_word_similarity_threshold" to 0.4;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
+set "pg_trgm.strict_word_similarity_threshold" to 0.2;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where 'Baykal' <<% t order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where 'Kabankala' <<% t order by sml desc, t;
+select t,strict_word_similarity('Baykal',t) as sml from test_trgm2 where t %>> 'Baykal' order by sml desc, t;
+select t,strict_word_similarity('Kabankala',t) as sml from test_trgm2 where t %>> 'Kabankala' order by sml desc, t;
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -6,6 +6,7 @@
 #include "access/gist.h"
 #include "access/itup.h"
+#include "access/stratnum.h"
 #include "storage/bufpage.h"
 /*
@@ -34,6 +35,8 @@
 #define RegExpICaseStrategyNumber			6
 #define WordSimilarityStrategyNumber		7
 #define WordDistanceStrategyNumber			8
+#define StrictWordSimilarityStrategyNumber	9
+#define StrictWordDistanceStrategyNumber	10
 typedef char trgm[3];
@@ -120,7 +123,9 @@ typedef struct TrgmPackedGraph TrgmPackedGraph;
 extern double similarity_threshold;
 extern double word_similarity_threshold;
+extern double strict_word_similarity_threshold;
+extern double index_strategy_get_limit(StrategyNumber strategy);
 extern uint32 trgm2int(trgm *ptr);
 extern void compact_trigram(trgm *tptr, char *str, int bytelen);
 extern TRGM *generate_trgm(char *str, int slen);

--- a/contrib/pg_trgm/trgm_gin.c
+++ b/contrib/pg_trgm/trgm_gin.c
@@ -90,6 +90,7 @@ gin_extract_query_trgm(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
+		case StrictWordSimilarityStrategyNumber:
 			trg = generate_trgm(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
 			break;
 		case ILikeStrategyNumber:
@@ -187,8 +188,8 @@ gin_trgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
+		case StrictWordSimilarityStrategyNumber:
-				similarity_threshold : word_similarity_threshold;
+			nlimit = index_strategy_get_limit(strategy);
 			/* Count the matches */
 			ntrue = 0;
@@ -282,8 +283,8 @@ gin_trgm_triconsistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			nlimit = (strategy == SimilarityStrategyNumber) ?
+		case StrictWordSimilarityStrategyNumber:
-				similarity_threshold : word_similarity_threshold;
+			nlimit = index_strategy_get_limit(strategy);
 			/* Count the matches */
 			ntrue = 0;

--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -221,6 +221,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 		{
 			case SimilarityStrategyNumber:
 			case WordSimilarityStrategyNumber:
+			case StrictWordSimilarityStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
 									 querysize - VARHDRSZ);
 				break;
@@ -290,10 +291,11 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
-			/* Similarity search is exact. Word similarity search is inexact */
+		case StrictWordSimilarityStrategyNumber:
-			*recheck = (strategy == WordSimilarityStrategyNumber);
+			/* Similarity search is exact. (Strict) word similarity search is inexact */
-			nlimit = (strategy == SimilarityStrategyNumber) ?
+			*recheck = (strategy != SimilarityStrategyNumber);
-				similarity_threshold : word_similarity_threshold;
+			nlimit = index_strategy_get_limit(strategy);
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
@@ -468,7 +470,9 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	{
 		case DistanceStrategyNumber:
 		case WordDistanceStrategyNumber:
-			*recheck = strategy == WordDistanceStrategyNumber;
+		case StrictWordDistanceStrategyNumber:
+			/* Only plain trigram distance is exact */
+			*recheck = (strategy != DistanceStrategyNumber);
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */

--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -105,6 +105,17 @@
       the explanation below.
      </entry>
     </row>
+     <row>
+      <entry>
+       <function>strict_word_similarity(text, text)</function>
+       <indexterm><primary>strict_word_similarity</primary></indexterm>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Same as <function>word_similarity(text, text)</function>, but forces
+       extent boundaries to match word boundaries.
+      </entry>
+     </row>
     <row>
      <entry><function>show_limit()</function><indexterm><primary>show_limit</primary></indexterm></entry>
      <entry><type>real</type></entry>
@@ -157,6 +168,29 @@
   a part of the word.
  </para>
+  <para>
+   At the same time, <function>strict_word_similarity(text, text)</function>
+   has to select an extent that matches word boundaries.  In the example above,
+   <function>strict_word_similarity(text, text)</function> would select the
+   extent <literal>{"  w"," wo","wor","ord","rds", ds "}</literal>, which
+   corresponds to the whole word <literal>'words'</literal>.
+<programlisting>
+# SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
+ strict_word_similarity | similarity
+------------------------+------------
+               0.571429 |   0.571429
+(1 row)
+</programlisting>
+  </para>
+  <para>
+   Thus, the <function>strict_word_similarity(text, text)</function> function
+   is useful for finding similar subsets of whole words, while
+   <function>word_similarity(text, text)</function> is more suitable for
+   searching similar parts of words.
+  </para>
  <table id="pgtrgm-op-table">
   <title><filename>pg_trgm</filename> Operators</title>
   <tgroup cols="3">
@@ -196,6 +230,24 @@
       Commutator of the <literal>&lt;%</literal> operator.
      </entry>
     </row>
+     <row>
+      <entry><type>text</type> <literal>&lt;&lt;%</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Returns <literal>true</literal> if its second argument has a continuous
+       extent of an ordered trigram set that matches word boundaries,
+       and its similarity to the trigram set of the first argument is greater
+       than the current strict word similarity threshold set by the
+       <varname>pg_trgm.strict_word_similarity_threshold</varname> parameter.
+      </entry>
+     </row>
+     <row>
+      <entry><type>text</type> <literal>%&gt;&gt;</literal> <type>text</type></entry>
+      <entry><type>boolean</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;%</literal> operator.
+      </entry>
+     </row>
     <row>
      <entry><type>text</type> <literal>&lt;-&gt;</literal> <type>text</type></entry>
      <entry><type>real</type></entry>
@@ -223,6 +275,25 @@
       Commutator of the <literal>&lt;&lt;-&gt;</literal> operator.
      </entry>
     </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;&lt;&lt;-&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Returns the <quote>distance</quote> between the arguments, that is
+       one minus the <function>strict_word_similarity()</function> value.
+      </entry>
+     </row>
+     <row>
+      <entry>
+       <type>text</type> <literal>&lt;-&gt;&gt;&gt;</literal> <type>text</type>
+      </entry>
+      <entry><type>real</type></entry>
+      <entry>
+       Commutator of the <literal>&lt;&lt;&lt;-&gt;</literal> operator.
+      </entry>
+     </row>
    </tbody>
   </tgroup>
  </table>
@@ -322,12 +393,19 @@ SELECT t, t &lt;-&gt; '<replaceable>word</replaceable>' AS dist
  <para>
   Also you can use an index on the <structfield>t</structfield> column for word
-   similarity.  For example:
+   similarity or strict word similarity.  Typical queries are:
 <programlisting>
 SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
  FROM test_trgm
  WHERE '<replaceable>word</replaceable>' &lt;% t
  ORDER BY sml DESC, t;
+</programlisting>
+   and
+<programlisting>
+SELECT t, strict_word_similarity('<replaceable>word</replaceable>', t) AS sml
+  FROM test_trgm
+  WHERE '<replaceable>word</replaceable>' &lt;&lt;% t
+  ORDER BY sml DESC, t;
 </programlisting>
   This will return all values in the text column for which there is a
   continuous extent in the corresponding ordered trigram set that is
@@ -337,11 +415,17 @@ SELECT t, word_similarity('<replaceable>word</replaceable>', t) AS sml
  </para>
  <para>
-   A variant of the above query is
+   Possible variants of the above queries are:
 <programlisting>
 SELECT t, '<replaceable>word</replaceable>' &lt;&lt;-&gt; t AS dist
  FROM test_trgm
  ORDER BY dist LIMIT 10;
+</programlisting>
+   and
+<programlisting>
+SELECT t, '<replaceable>word</replaceable>' &lt;&lt;&lt;-&gt; t AS dist
+  FROM test_trgm
+  ORDER BY dist LIMIT 10;
 </programlisting>
   This can be implemented quite efficiently by GiST indexes, but not
   by GIN indexes.