Commit 1664ae19 authored by Teodor Sigaev's avatar Teodor Sigaev

Add websearch_to_tsquery

Error-tolerant conversion function with web-like syntax for search query,
it simplifies  constraining search engine with close to habitual interface for
users.

Bump catalog version

Authors: Victor Drobny, Dmitry Ivanov with editorization by me
Reviewed by: Aleksander Alekseev, Tomas Vondra, Thomas Munro, Aleksandr Parfenov
Discussion: https://www.postgresql.org/message-id/flat/fe931111ff7e9ad79196486ada79e268@postgrespro.ru
parent fbc27330
......@@ -9630,6 +9630,18 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><literal>phraseto_tsquery('english', 'The Fat Rats')</literal></entry>
<entry><literal>'fat' &lt;-&gt; 'rat'</literal></entry>
</row>
<row>
<entry>
<indexterm>
<primary>websearch_to_tsquery</primary>
</indexterm>
<literal><function>websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type> , </optional> <replaceable class="parameter">query</replaceable> <type>text</type>)</function></literal>
</entry>
<entry><type>tsquery</type></entry>
<entry>produce <type>tsquery</type> from a web search style query</entry>
<entry><literal>websearch_to_tsquery('english', '"fat rat" or rat')</literal></entry>
<entry><literal>'fat' &lt;-&gt; 'rat' | 'rat'</literal></entry>
</row>
<row>
<entry>
<indexterm>
......
......@@ -797,13 +797,16 @@ UPDATE tt SET ti =
<para>
<productname>PostgreSQL</productname> provides the
functions <function>to_tsquery</function>,
<function>plainto_tsquery</function>, and
<function>phraseto_tsquery</function>
<function>plainto_tsquery</function>,
<function>phraseto_tsquery</function> and
<function>websearch_to_tsquery</function>
for converting a query to the <type>tsquery</type> data type.
<function>to_tsquery</function> offers access to more features
than either <function>plainto_tsquery</function> or
<function>phraseto_tsquery</function>, but it is less forgiving
about its input.
<function>phraseto_tsquery</function>, but it is less forgiving about its
input. <function>websearch_to_tsquery</function> is a simplified version
of <function>to_tsquery</function> with an alternative syntax, similar
to the one used by web search engines.
</para>
<indexterm>
......@@ -962,6 +965,87 @@ SELECT phraseto_tsquery('english', 'The Fat &amp; Rats:C');
</screen>
</para>
<synopsis>
websearch_to_tsquery(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type>, </optional> <replaceable class="parameter">querytext</replaceable> <type>text</type>) returns <type>tsquery</type>
</synopsis>
<para>
<function>websearch_to_tsquery</function> creates a <type>tsquery</type>
value from <replaceable>querytext</replaceable> using an alternative
syntax in which simple unformatted text is a valid query.
Unlike <function>plainto_tsquery</function>
and <function>phraseto_tsquery</function>, it also recognizes certain
operators. Moreover, this function should never raise syntax errors,
which makes it possible to use raw user-supplied input for search.
The following syntax is supported:
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<para>
<literal>unquoted text</literal>: text not inside quote marks will be
converted to terms separated by <literal>&amp;</literal> operators, as
if processed by
<function>plainto_tsquery</function>.
</para>
</listitem>
<listitem>
<para>
<literal>"quoted text"</literal>: text inside quote marks will be
converted to terms separated by <literal>&lt;-&gt;</literal>
operators, as if processed by <function>phraseto_tsquery</function>.
</para>
</listitem>
<listitem>
<para>
<literal>OR</literal>: logical or will be converted to
the <literal>|</literal> operator.
</para>
</listitem>
<listitem>
<para>
<literal>-</literal>: the logical not operator, converted to the
the <literal>!</literal> operator.
</para>
</listitem>
</itemizedlist>
</para>
<para>
Examples:
<screen>
select websearch_to_tsquery('english', 'The fat rats');
websearch_to_tsquery
-----------------
'fat' &amp; 'rat'
(1 row)
</screen>
<screen>
select websearch_to_tsquery('english', '"supernovae stars" -crab');
websearch_to_tsquery
----------------------------------
'supernova' &lt;-&gt; 'star' &amp; !'crab'
(1 row)
</screen>
<screen>
select websearch_to_tsquery('english', '"sad cat" or "fat rat"');
websearch_to_tsquery
-----------------------------------
'sad' &lt;-&gt; 'cat' | 'fat' &lt;-&gt; 'rat'
(1 row)
</screen>
<screen>
select websearch_to_tsquery('english', 'signal -"segmentation fault"');
websearch_to_tsquery
---------------------------------------
'signal' &amp; !( 'segment' &lt;-&gt; 'fault' )
(1 row)
</screen>
<screen>
select websearch_to_tsquery('english', '""" )( dummy \\ query &lt;-&gt;');
websearch_to_tsquery
----------------------
'dummi' &amp; 'queri'
(1 row)
</screen>
</para>
</sect2>
<sect2 id="textsearch-ranking">
......
......@@ -490,7 +490,7 @@ to_tsquery_byid(PG_FUNCTION_ARGS)
query = parse_tsquery(text_to_cstring(in),
pushval_morph,
PointerGetDatum(&data),
false);
0);
PG_RETURN_TSQUERY(query);
}
......@@ -520,7 +520,7 @@ plainto_tsquery_byid(PG_FUNCTION_ARGS)
query = parse_tsquery(text_to_cstring(in),
pushval_morph,
PointerGetDatum(&data),
true);
P_TSQ_PLAIN);
PG_RETURN_POINTER(query);
}
......@@ -551,7 +551,7 @@ phraseto_tsquery_byid(PG_FUNCTION_ARGS)
query = parse_tsquery(text_to_cstring(in),
pushval_morph,
PointerGetDatum(&data),
true);
P_TSQ_PLAIN);
PG_RETURN_TSQUERY(query);
}
......@@ -567,3 +567,35 @@ phraseto_tsquery(PG_FUNCTION_ARGS)
ObjectIdGetDatum(cfgId),
PointerGetDatum(in)));
}
Datum
websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_PP(1);
MorphOpaque data;
TSQuery query = NULL;
data.cfg_id = PG_GETARG_OID(0);
data.qoperator = OP_AND;
query = parse_tsquery(text_to_cstring(in),
pushval_morph,
PointerGetDatum(&data),
P_TSQ_WEB);
PG_RETURN_TSQUERY(query);
}
Datum
websearch_to_tsquery(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_PP(0);
Oid cfgId;
cfgId = getTSCurrentConfig(true);
PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
ObjectIdGetDatum(cfgId),
PointerGetDatum(in)));
}
This diff is collapsed.
......@@ -200,7 +200,7 @@ tsvectorin(PG_FUNCTION_ARGS)
char *cur;
int buflen = 256; /* allocated size of tmpbuf */
state = init_tsvector_parser(buf, false, false);
state = init_tsvector_parser(buf, 0);
arrlen = 64;
arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
......
......@@ -33,6 +33,7 @@ struct TSVectorParseStateData
int eml; /* max bytes per character */
bool oprisdelim; /* treat ! | * ( ) as delimiters? */
bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
bool is_web; /* we're in websearch_to_tsquery() */
};
......@@ -42,7 +43,7 @@ struct TSVectorParseStateData
* ! | & ( )
*/
TSVectorParseState
init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
init_tsvector_parser(char *input, int flags)
{
TSVectorParseState state;
......@@ -52,8 +53,9 @@ init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
state->len = 32;
state->word = (char *) palloc(state->len);
state->eml = pg_database_encoding_max_length();
state->oprisdelim = oprisdelim;
state->is_tsquery = is_tsquery;
state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
state->is_web = (flags & P_TSV_IS_WEB) != 0;
return state;
}
......@@ -89,16 +91,6 @@ do { \
} \
} while (0)
/* phrase operator begins with '<' */
#define ISOPERATOR(x) \
( pg_mblen(x) == 1 && ( *(x) == '!' || \
*(x) == '&' || \
*(x) == '|' || \
*(x) == '(' || \
*(x) == ')' || \
*(x) == '<' \
) )
/* Fills gettoken_tsvector's output parameters, and returns true */
#define RETURN_TOKEN \
do { \
......@@ -183,14 +175,15 @@ gettoken_tsvector(TSVectorParseState state,
{
if (*(state->prsbuf) == '\0')
return false;
else if (t_iseq(state->prsbuf, '\''))
else if (!state->is_web && t_iseq(state->prsbuf, '\''))
statecode = WAITENDCMPLX;
else if (t_iseq(state->prsbuf, '\\'))
else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
(state->is_web && t_iseq(state->prsbuf, '"')))
PRSSYNTAXERROR;
else if (!t_isspace(state->prsbuf))
{
......@@ -217,13 +210,14 @@ gettoken_tsvector(TSVectorParseState state,
}
else if (statecode == WAITENDWORD)
{
if (t_iseq(state->prsbuf, '\\'))
if (!state->is_web && t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDWORD;
}
else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
(state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
(state->is_web && t_iseq(state->prsbuf, '"')))
{
RESIZEPRSBUF;
if (curpos == state->word)
......@@ -250,11 +244,11 @@ gettoken_tsvector(TSVectorParseState state,
}
else if (statecode == WAITENDCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
if (!state->is_web && t_iseq(state->prsbuf, '\''))
{
statecode = WAITCHARCMPLX;
}
else if (t_iseq(state->prsbuf, '\\'))
else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
{
statecode = WAITNEXTCHAR;
oldstate = WAITENDCMPLX;
......@@ -270,7 +264,7 @@ gettoken_tsvector(TSVectorParseState state,
}
else if (statecode == WAITCHARCMPLX)
{
if (t_iseq(state->prsbuf, '\''))
if (!state->is_web && t_iseq(state->prsbuf, '\''))
{
RESIZEPRSBUF;
COPYCHAR(curpos, state->prsbuf);
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201804031
#define CATALOG_VERSION_NO 201804051
#endif
......@@ -4971,6 +4971,8 @@ DATA(insert OID = 3747 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s
DESCR("transform to tsquery");
DATA(insert OID = 5006 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery_byid _null_ _null_ _null_ ));
DESCR("transform to tsquery");
DATA(insert OID = 8889 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery_byid _null_ _null_ _null_ ));
DESCR("transform to tsquery");
DATA(insert OID = 3749 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "25" _null_ _null_ _null_ _null_ _null_ to_tsvector _null_ _null_ _null_ ));
DESCR("transform to tsvector");
DATA(insert OID = 3750 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ to_tsquery _null_ _null_ _null_ ));
......@@ -4979,6 +4981,8 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s
DESCR("transform to tsquery");
DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ ));
DESCR("transform to tsquery");
DATA(insert OID = 8890 ( websearch_to_tsquery PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ websearch_to_tsquery _null_ _null_ _null_ ));
DESCR("transform to tsquery");
DATA(insert OID = 4209 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ ));
DESCR("transform jsonb to tsvector");
DATA(insert OID = 4210 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f t f s s 1 0 3614 "114" _null_ _null_ _null_ _null_ _null_ json_to_tsvector _null_ _null_ _null_ ));
......
......@@ -25,9 +25,11 @@
struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */
typedef struct TSVectorParseStateData *TSVectorParseState;
extern TSVectorParseState init_tsvector_parser(char *input,
bool oprisdelim,
bool is_tsquery);
#define P_TSV_OPR_IS_DELIM (1 << 0)
#define P_TSV_IS_TSQUERY (1 << 1)
#define P_TSV_IS_WEB (1 << 2)
extern TSVectorParseState init_tsvector_parser(char *input, int flags);
extern void reset_tsvector_parser(TSVectorParseState state, char *input);
extern bool gettoken_tsvector(TSVectorParseState state,
char **token, int *len,
......@@ -35,6 +37,16 @@ extern bool gettoken_tsvector(TSVectorParseState state,
char **endptr);
extern void close_tsvector_parser(TSVectorParseState state);
/* phrase operator begins with '<' */
#define ISOPERATOR(x) \
( pg_mblen(x) == 1 && ( *(x) == '!' || \
*(x) == '&' || \
*(x) == '|' || \
*(x) == '(' || \
*(x) == ')' || \
*(x) == '<' \
) )
/* parse_tsquery */
struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */
......@@ -46,9 +58,13 @@ typedef void (*PushFunction) (Datum opaque, TSQueryParserState state,
* QueryOperand struct */
bool prefix);
#define P_TSQ_PLAIN (1 << 0)
#define P_TSQ_WEB (1 << 1)
extern TSQuery parse_tsquery(char *buf,
PushFunction pushval,
Datum opaque, bool isplain);
PushFunction pushval,
Datum opaque,
int flags);
/* Functions for use by PushFunction implementations */
extern void pushValue(TSQueryParserState state,
......
This diff is collapsed.
......@@ -539,3 +539,97 @@ create index phrase_index_test_idx on phrase_index_test using gin(fts);
set enable_seqscan = off;
select * from phrase_index_test where fts @@ phraseto_tsquery('english', 'fat cat');
set enable_seqscan = on;
-- test websearch_to_tsquery function
select websearch_to_tsquery('simple', 'I have a fat:*ABCD cat');
select websearch_to_tsquery('simple', 'orange:**AABBCCDD');
select websearch_to_tsquery('simple', 'fat:A!cat:B|rat:C<');
select websearch_to_tsquery('simple', 'fat:A : cat:B');
select websearch_to_tsquery('simple', 'fat*rat');
select websearch_to_tsquery('simple', 'fat-rat');
select websearch_to_tsquery('simple', 'fat_rat');
-- weights are completely ignored
select websearch_to_tsquery('simple', 'abc : def');
select websearch_to_tsquery('simple', 'abc:def');
select websearch_to_tsquery('simple', 'a:::b');
select websearch_to_tsquery('simple', 'abc:d');
select websearch_to_tsquery('simple', ':');
-- these operators are ignored
select websearch_to_tsquery('simple', 'abc & def');
select websearch_to_tsquery('simple', 'abc | def');
select websearch_to_tsquery('simple', 'abc <-> def');
select websearch_to_tsquery('simple', 'abc (pg or class)');
-- NOT is ignored in quotes
select websearch_to_tsquery('english', 'My brand new smartphone');
select websearch_to_tsquery('english', 'My brand "new smartphone"');
select websearch_to_tsquery('english', 'My brand "new -smartphone"');
-- test OR operator
select websearch_to_tsquery('simple', 'cat or rat');
select websearch_to_tsquery('simple', 'cat OR rat');
select websearch_to_tsquery('simple', 'cat "OR" rat');
select websearch_to_tsquery('simple', 'cat OR');
select websearch_to_tsquery('simple', 'OR rat');
select websearch_to_tsquery('simple', '"fat cat OR rat"');
select websearch_to_tsquery('simple', 'fat (cat OR rat');
select websearch_to_tsquery('simple', 'or OR or');
-- OR is an operator here ...
select websearch_to_tsquery('simple', '"fat cat"or"fat rat"');
select websearch_to_tsquery('simple', 'fat or(rat');
select websearch_to_tsquery('simple', 'fat or)rat');
select websearch_to_tsquery('simple', 'fat or&rat');
select websearch_to_tsquery('simple', 'fat or|rat');
select websearch_to_tsquery('simple', 'fat or!rat');
select websearch_to_tsquery('simple', 'fat or<rat');
select websearch_to_tsquery('simple', 'fat or>rat');
select websearch_to_tsquery('simple', 'fat or ');
-- ... but not here
select websearch_to_tsquery('simple', 'abc orange');
select websearch_to_tsquery('simple', 'abc orтест');
select websearch_to_tsquery('simple', 'abc OR1234');
select websearch_to_tsquery('simple', 'abc or-abc');
select websearch_to_tsquery('simple', 'abc OR_abc');
-- test quotes
select websearch_to_tsquery('english', '"pg_class pg');
select websearch_to_tsquery('english', 'pg_class pg"');
select websearch_to_tsquery('english', '"pg_class pg"');
select websearch_to_tsquery('english', 'abc "pg_class pg"');
select websearch_to_tsquery('english', '"pg_class pg" def');
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
select websearch_to_tsquery('english', '""pg pg_class pg""');
select websearch_to_tsquery('english', 'abc """"" def');
select websearch_to_tsquery('english', 'cat -"fat rat"');
select websearch_to_tsquery('english', 'cat -"fat rat" cheese');
select websearch_to_tsquery('english', 'abc "def -"');
select websearch_to_tsquery('english', 'abc "def :"');
select websearch_to_tsquery('english', '"A fat cat" has just eaten a -rat.');
select websearch_to_tsquery('english', '"A fat cat" has just eaten OR !rat.');
select websearch_to_tsquery('english', '"A fat cat" has just (+eaten OR -rat)');
select websearch_to_tsquery('english', 'this is ----fine');
select websearch_to_tsquery('english', '(()) )))) this ||| is && -fine, "dear friend" OR good');
select websearch_to_tsquery('english', 'an old <-> cat " is fine &&& too');
select websearch_to_tsquery('english', '"A the" OR just on');
select websearch_to_tsquery('english', '"a fat cat" ate a rat');
select to_tsvector('english', 'A fat cat ate a rat') @@
websearch_to_tsquery('english', '"a fat cat" ate a rat');
select to_tsvector('english', 'A fat grey cat ate a rat') @@
websearch_to_tsquery('english', '"a fat cat" ate a rat');
-- cases handled by gettoken_tsvector()
select websearch_to_tsquery('''');
select websearch_to_tsquery('''abc''''def''');
select websearch_to_tsquery('\abc');
select websearch_to_tsquery('\');
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment