Commit bb892375 authored by Teodor Sigaev's avatar Teodor Sigaev

1. Eliminate the duplicate field HLWORD->skip.

2. Rework support for HTML tags in the parser.
3. Add a HighlightAll option to the headline function that generates the
   whole text highlighted, with HTML tags preserved.
parent e48cfacb
...@@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc ...@@ -458,20 +458,20 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
12 | 12 |
1 | asdf 1 | asdf
12 | 12 |
13 | 13 | <fr>
1 | qwer 1 | qwer
12 | 12 |
1 | jf 1 | jf
12 | 12 |
1 | sdjk 1 | sdjk
13 | 13 | <we hjwer <werrwe>
12 | 12 |
3 | ewr1 3 | ewr1
12 | > 12 | >
12 | 12 |
3 | ewri2 3 | ewri2
12 | 12 |
13 | 13 | <a href="qwe<qwe>">
12 | 12 |
19 | /usr/local/fff 19 | /usr/local/fff
...@@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc ...@@ -515,7 +515,7 @@ select * from parse('default', '345 qwe@efd.r \' http://www.com/ http://aew.werc
22 | 234 22 | 234
12 | 12 |
13 | 13 | <i <b>
12 | 12 |
1 | wow 1 | wow
12 | 12 |
...@@ -2130,6 +2130,35 @@ A thousand years to trace ...@@ -2130,6 +2130,35 @@ A thousand years to trace
The granite features of this cliff The granite features of this cliff
(1 row) (1 row)
select headline('
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('sea&foo'), 'HighlightAll=true');
headline
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
<html>
<!-- some comment -->
<body>
<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>
(1 row)
--check debug --check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
ts_name | tok_type | description | token | dict_name | tsvector ts_name | tok_type | description | token | dict_name | tsvector
......
...@@ -253,6 +253,20 @@ The sculpture of these granite seams, ...@@ -253,6 +253,20 @@ The sculpture of these granite seams,
Upon a woman s face. E. J. Pratt (1882 1964) Upon a woman s face. E. J. Pratt (1882 1964)
', to_tsquery('sea')); ', to_tsquery('sea'));
select headline('
<html>
<!-- some comment -->
<body>
Sea view wow <u>foo bar</u> <i>qq</i>
<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
ff-bg
<script>
document.write(15);
</script>
</body>
</html>',
to_tsquery('sea&foo'), 'HighlightAll=true');
--check debug --check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3'); select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
......
...@@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs) ...@@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
ptr = ((char *) out) + dist; ptr = ((char *) out) + dist;
} }
if (wrd->in && !wrd->skip && !wrd->repeated) if (wrd->in && !wrd->repeated)
{ {
if (wrd->replace) if (wrd->replace)
{ {
...@@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs) ...@@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
ptr += prs->stopsellen; ptr += prs->stopsellen;
} }
} }
} } else
if (!wrd->repeated) if (!wrd->repeated)
pfree(wrd->word); pfree(wrd->word);
......
...@@ -46,13 +46,13 @@ typedef struct ...@@ -46,13 +46,13 @@ typedef struct
typedef struct typedef struct
{ {
uint16 len; uint32 selected:1,
uint8 selected:1,
in:1, in:1,
skip:1,
replace:1, replace:1,
repeated:1; repeated:1,
uint8 type; unused:4,
type:8,
len:16;
char *word; char *word;
ITEM *item; ITEM *item;
} HLWORD; } HLWORD;
......
...@@ -10,10 +10,48 @@ ...@@ -10,10 +10,48 @@
char *token = NULL; /* pointer to token */ char *token = NULL; /* pointer to token */
int tokenlen; int tokenlen;
char *s = NULL; /* to return WHOLE hyphenated-word */ static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
/*
 * Growable buffer in which the lexer accumulates the text of the tag,
 * comment or script block currently being scanned.
 */
typedef struct
{
	int		tlen;	/* number of bytes allocated for str */
	int		clen;	/* bytes currently stored in str, excluding the NUL */
	char   *str;	/* malloc'ed, NUL-terminated text; NULL until first use */
} TagStorage;

/* Single file-level accumulator; starts out empty and unallocated. */
static TagStorage ts = { 0, 0, NULL };
/*
 * Append the current lexer match (tsearch2_yyleng bytes starting at
 * tsearch2_yytext) to the tag accumulator ts, growing the buffer when
 * necessary, and keep the accumulated text NUL-terminated.
 *
 * Reports ERRCODE_OUT_OF_MEMORY through ereport(ERROR) if the buffer cannot
 * be enlarged; in that case ts is left untouched and still owns its old
 * buffer.
 */
static void
addTag() {
	/* room required: existing text + new match + terminating NUL */
	int		need = ts.clen + tsearch2_yyleng + 1;

	if (need > ts.tlen)
	{
		/*
		 * Start from at least 1 so the doubling below always makes progress,
		 * even if the accumulator has never been sized.
		 */
		int		newlen = (ts.tlen > 0) ? ts.tlen : 1;
		char   *tmp;

		while (newlen < need)
			newlen *= 2;

		/*
		 * Reallocate through a temporary: on failure the original buffer
		 * must stay reachable from ts.str instead of being leaked, and
		 * ts.tlen must keep describing the buffer we actually own.
		 */
		tmp = realloc(ts.str, newlen);
		if (!tmp)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		ts.str = tmp;
		ts.tlen = newlen;
	}

	memcpy(ts.str + ts.clen, tsearch2_yytext, tsearch2_yyleng);
	ts.clen += tsearch2_yyleng;
	ts.str[ts.clen] = '\0';
}
/*
 * Begin accumulating a new tag: make sure the accumulator has a buffer,
 * discard whatever it held before, and store the current lexer match as
 * the first piece of the tag text.
 */
static void
startTag() {
	if (!ts.str)
	{
		/* first use: size the buffer for this match plus the NUL */
		ts.tlen = tsearch2_yyleng + 1;
		ts.str = malloc(ts.tlen);
		if (ts.str == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	/* reset to an empty string, then append the opening match */
	ts.str[0] = '\0';
	ts.clen = 0;
	addTag();
}
%} %}
%option 8bit %option 8bit
...@@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+ ...@@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%% %%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; } "<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" { <INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL; BEGIN INITIAL;
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; addTag();
token = tsearch2_yytext; token = ts.str;
tokenlen = tsearch2_yyleng; tokenlen = ts.clen;
return SPACE; return TAG;
} }
"<!--" { BEGIN INCOMMENT; } "<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" { <INCOMMENT>"-->" {
BEGIN INITIAL; BEGIN INITIAL;
*tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; addTag();
token = tsearch2_yytext; token = ts.str;
tokenlen = tsearch2_yyleng; tokenlen = ts.clen;
return SPACE; return TAG;
} }
"<"[\![:alpha:]] { BEGIN INTAG; } "<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { BEGIN INTAG; } "</"[[:alpha:]] { BEGIN INTAG; startTag(); }
<INTAG>"\"" { BEGIN QINTAG; } <INTAG>"\"" { BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" ; <QINTAG>"\\\"" { addTag(); }
<QINTAG>"\"" { BEGIN INTAG; } <QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" { <INTAG>">" {
BEGIN INITIAL; BEGIN INITIAL;
token = tsearch2_yytext; addTag();
*tsearch2_yytext=' '; token = ts.str;
token = tsearch2_yytext; tokenlen = ts.clen;
tokenlen = 1;
return TAG; return TAG;
} }
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ; <QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; { \&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext; token = tsearch2_yytext;
...@@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) { ...@@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
tsearch2_yy_switch_to_buffer( buf ); tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL; BEGIN INITIAL;
} }
...@@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS) ...@@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)
#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 ) #define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 ) #define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) ) #define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
...@@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
curlen; curlen;
int i; int i;
int highlight=0;
/* config */ /* config */
prs->startsel = NULL; prs->startsel = NULL;
...@@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
prs->startsel = pstrdup(mptr->value); prs->startsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "StopSel") == 0) else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
prs->stopsel = pstrdup(mptr->value); prs->stopsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
highlight = (
pg_strcasecmp(mptr->value, "1")==0 ||
pg_strcasecmp(mptr->value, "on")==0 ||
pg_strcasecmp(mptr->value, "true")==0 ||
pg_strcasecmp(mptr->value, "t")==0 ||
pg_strcasecmp(mptr->value, "y")==0 ||
pg_strcasecmp(mptr->value, "yes")==0 ) ?
1 : 0;
pfree(mptr->key); pfree(mptr->key);
pfree(mptr->value); pfree(mptr->value);
...@@ -228,6 +239,7 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -228,6 +239,7 @@ prsd_headline(PG_FUNCTION_ARGS)
} }
pfree(map); pfree(map);
if (highlight==0) {
if (min_words >= max_words) if (min_words >= max_words)
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
...@@ -241,7 +253,9 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -241,7 +253,9 @@ prsd_headline(PG_FUNCTION_ARGS)
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0"))); errmsg("ShortWord should be >= 0")));
} }
}
if (highlight==0) {
while (hlCover(prs, query, &p, &q)) while (hlCover(prs, query, &p, &q))
{ {
/* find cover len in words */ /* find cover len in words */
...@@ -325,7 +339,6 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -325,7 +339,6 @@ prsd_headline(PG_FUNCTION_ARGS)
if (bestlen < 0) if (bestlen < 0)
{ {
curlen = 0; curlen = 0;
poslen = 0;
for (i = 0; i < prs->curwords && curlen < min_words; i++) for (i = 0; i < prs->curwords && curlen < min_words; i++)
{ {
if (!NONWORDTOKEN(prs->words[i].type)) if (!NONWORDTOKEN(prs->words[i].type))
...@@ -335,17 +348,24 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -335,17 +348,24 @@ prsd_headline(PG_FUNCTION_ARGS)
bestb = 0; bestb = 0;
beste = pose; beste = pose;
} }
} else {
bestb=0;
beste=prs->curwords-1;
}
for (i = bestb; i <= beste; i++) for (i = bestb; i <= beste; i++)
{ {
if (prs->words[i].item) if (prs->words[i].item)
prs->words[i].selected = 1; prs->words[i].selected = 1;
if (prs->words[i].repeated) if ( highlight==0 ) {
prs->words[i].skip = 1;
if (HLIDIGNORE(prs->words[i].type)) if (HLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1; prs->words[i].replace = 1;
} else {
if (HTMLHLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1;
}
prs->words[i].in = 1; prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
} }
if (!prs->startsel) if (!prs->startsel)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment