Commit 2a0083ed authored by Teodor Sigaev's avatar Teodor Sigaev

Improve headline generation. Now headline can contain

several fragments a-la Google.

Sushant Sinha <sushant354@gmail.com>
parent 906b7e5f
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.45 2008/09/23 09:20:34 heikki Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.46 2008/10/17 18:05:19 teodor Exp $ -->
<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
......@@ -1098,6 +1098,29 @@ ORDER BY rank DESC LIMIT 10;
value of three eliminates the English articles.
</para>
</listitem>
<listitem>
<para>
<literal>MaxFragments</literal>: maximum number of text excerpts
or fragments that match the query words. It also triggers a
different headline generation function than the default one. This
function finds text fragments with as many query words as possible and
stretches those fragments around the query words. As a result
query words are close to the middle of each fragment and have words on
each side. Each fragment will be of at most MaxWords words and will not
have words of length less than or equal to ShortWord at the start or
end of a fragment. If not all query words are found in the document,
then a single fragment of the first MinWords words will be displayed.
</para>
</listitem>
<listitem>
<para>
<literal>FragmentDelimiter</literal>: When more than one fragment is
displayed, the fragments will be separated by this delimiter. This
option is effective only if MaxFragments is greater than 1 and there is
more than one fragment to be displayed. This option has no effect on the
default headline generation function.
</para>
</listitem>
<listitem>
<para>
<literal>HighlightAll</literal>: Boolean flag; if
......@@ -1109,7 +1132,7 @@ ORDER BY rank DESC LIMIT 10;
Any unspecified options receive these defaults:
<programlisting>
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
StartSel=&lt;b&gt;, StopSel=&lt;/b&gt;, MaxFragments=0, FragmentDelimiter=" ... ", MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE
</programlisting>
</para>
......
......@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.8 2008/05/16 16:31:01 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.9 2008/10/17 18:05:19 teodor Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -583,8 +583,11 @@ text *
generateHeadline(HeadlineParsedText *prs)
{
text *out;
int len = 128;
char *ptr;
int len = 128;
int numfragments = 0;
int2 infrag = 0;
HeadlineWordEntry *wrd = prs->words;
out = (text *) palloc(len);
......@@ -592,7 +595,7 @@ generateHeadline(HeadlineParsedText *prs)
while (wrd - prs->words < prs->curwords)
{
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
{
int dist = ptr - ((char *) out);
......@@ -603,6 +606,20 @@ generateHeadline(HeadlineParsedText *prs)
if (wrd->in && !wrd->repeated)
{
if (!infrag)
{
/* start of a new fragment */
infrag = 1;
numfragments ++;
/* add a fragment delimitor if this is after the first one */
if (numfragments > 1)
{
memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
ptr += prs->fragdelimlen;
}
}
if (wrd->replace)
{
*ptr = ' ';
......@@ -625,7 +642,11 @@ generateHeadline(HeadlineParsedText *prs)
}
}
else if (!wrd->repeated)
{
if (infrag)
infrag = 0;
pfree(wrd->word);
}
wrd++;
}
......
This diff is collapsed.
......@@ -6,7 +6,7 @@
*
* Copyright (c) 1998-2008, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.10 2008/06/18 18:42:54 momjian Exp $
* $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.11 2008/10/17 18:05:19 teodor Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -52,8 +52,10 @@ typedef struct
int4 curwords;
char *startsel;
char *stopsel;
char *fragdelim;
int2 startsellen;
int2 stopsellen;
int2 fragdelimlen;
} HeadlineParsedText;
/*
......
......@@ -632,6 +632,98 @@ to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
</html>
(1 row)
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
ts_headline
------------------------------------
after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted <b>Ocean</b>.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop
(1 row)
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
ts_headline
----------------------------------------------
after day, day after day,
We <b>stuck</b>, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where ... drop to drink.
S. T. <b>Coleridge</b>
(1 row)
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
ts_headline
------------------------------------
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as
(1 row)
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
ts_headline
--------------------------------------------
after day, day after day,
We <b>stuck</b>, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where***drop to drink.
S. T. <b>Coleridge</b>
(1 row)
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
\set ECHO none
......
......@@ -208,6 +208,58 @@ ff-bg
</html>',
to_tsquery('english', 'sea&foo'), 'HighlightAll=true');
--Check if headline fragments work
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean'), 'MaxFragments=1');
--Check if more than one fragments are displayed
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2');
--Fragments when there all query words are not in the document
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'ocean & seahorse'), 'MaxFragments=1');
--FragmentDelimiter option
SELECT ts_headline('english', '
Day after day, day after day,
We stuck, nor breath nor motion,
As idle as a painted Ship
Upon a painted Ocean.
Water, water, every where
And all the boards did shrink;
Water, water, every where,
Nor any drop to drink.
S. T. Coleridge (1772-1834)
', to_tsquery('english', 'Coleridge & stuck'), 'MaxFragments=2,FragmentDelimiter=***');
--Rewrite sub system
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment