Commit 1157f3cc authored by Andrew Dunstan

Change descriptions of entity and tag objects to "XML entity" and "XML tag".

Allow tag and entity names that follow XML rules. Provide for hexadecimal
as well as decimal numeric entities. Adjust code names to coincide with
new descriptions.
parent a262394c
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.36 2007/11/16 03:23:07 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.37 2007/11/20 02:25:22 adunstan Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -1862,12 +1862,12 @@ LIMIT 10; ...@@ -1862,12 +1862,12 @@ LIMIT 10;
</row> </row>
<row> <row>
<entry><literal>tag</></entry> <entry><literal>tag</></entry>
<entry>HTML tag</entry> <entry>XML tag</entry>
<entry><literal>&lt;A HREF="dictionaries.html"&gt;</literal></entry> <entry><literal>&lt;a href="dictionaries.html"&gt;</literal></entry>
</row> </row>
<row> <row>
<entry><literal>entity</></entry> <entry><literal>entity</></entry>
<entry>HTML entity</entry> <entry>XML entity</entry>
<entry><literal>&amp;amp;</literal></entry> <entry><literal>&amp;amp;</literal></entry>
</row> </row>
<row> <row>
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
#define DECIMAL 20 #define DECIMAL 20
#define SIGNEDINT 21 #define SIGNEDINT 21
#define UNSIGNEDINT 22 #define UNSIGNEDINT 22
#define HTMLENTITY 23 #define XMLENTITY 23
#define LASTNUM 23 #define LASTNUM 23
...@@ -95,7 +95,7 @@ static const char *const lex_descr[] = { ...@@ -95,7 +95,7 @@ static const char *const lex_descr[] = {
"Hyphenated word part, all letters", "Hyphenated word part, all letters",
"Hyphenated word part, all ASCII", "Hyphenated word part, all ASCII",
"Space symbols", "Space symbols",
"HTML tag", "XML tag",
"Protocol head", "Protocol head",
"Hyphenated word, letters and digits", "Hyphenated word, letters and digits",
"Hyphenated word, all ASCII", "Hyphenated word, all ASCII",
...@@ -105,7 +105,7 @@ static const char *const lex_descr[] = { ...@@ -105,7 +105,7 @@ static const char *const lex_descr[] = {
"Decimal notation", "Decimal notation",
"Signed integer", "Signed integer",
"Unsigned integer", "Unsigned integer",
"HTML entity" "XML entity"
}; };
...@@ -132,11 +132,13 @@ typedef enum ...@@ -132,11 +132,13 @@ typedef enum
TPS_InMantissaFirst, TPS_InMantissaFirst,
TPS_InMantissaSign, TPS_InMantissaSign,
TPS_InMantissa, TPS_InMantissa,
TPS_InHTMLEntityFirst, TPS_InXMLEntityFirst,
TPS_InHTMLEntity, TPS_InXMLEntity,
TPS_InHTMLEntityNumFirst, TPS_InXMLEntityNumFirst,
TPS_InHTMLEntityNum, TPS_InXMLEntityNum,
TPS_InHTMLEntityEnd, TPS_InXMLEntityHexNumFirst,
TPS_InXMLEntityHexNum,
TPS_InXMLEntityEnd,
TPS_InTagFirst, TPS_InTagFirst,
TPS_InXMLBegin, TPS_InXMLBegin,
TPS_InTagCloseFirst, TPS_InTagCloseFirst,
...@@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = { ...@@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = {
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL}, {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
...@@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = { ...@@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = {
{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL} {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
}; };
static const TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static const TParserStateActionItem actionTPS_InHTMLEntity[] = { static const TParserStateActionItem actionTPS_InXMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = { static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static const TParserStateActionItem actionTPS_InHTMLEntityNum[] = { static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static const TParserStateActionItem actionTPS_InHTMLEntityEnd[] = { static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL} {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
}; };
static const TParserStateActionItem actionTPS_InTagFirst[] = { static const TParserStateActionItem actionTPS_InTagFirst[] = {
...@@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = { ...@@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = {
static const TParserStateActionItem actionTPS_InXMLBegin[] = { static const TParserStateActionItem actionTPS_InXMLBegin[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
/* <?xml ... */ /* <?xml ... */
/* XXX do we want states for the m and l ? Right now this accepts <?xZ */
{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL}, {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
{p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -1278,11 +1301,13 @@ static const TParserStateAction Actions[] = { ...@@ -1278,11 +1301,13 @@ static const TParserStateAction Actions[] = {
TPARSERSTATEACTION(TPS_InMantissaFirst), TPARSERSTATEACTION(TPS_InMantissaFirst),
TPARSERSTATEACTION(TPS_InMantissaSign), TPARSERSTATEACTION(TPS_InMantissaSign),
TPARSERSTATEACTION(TPS_InMantissa), TPARSERSTATEACTION(TPS_InMantissa),
TPARSERSTATEACTION(TPS_InHTMLEntityFirst), TPARSERSTATEACTION(TPS_InXMLEntityFirst),
TPARSERSTATEACTION(TPS_InHTMLEntity), TPARSERSTATEACTION(TPS_InXMLEntity),
TPARSERSTATEACTION(TPS_InHTMLEntityNumFirst), TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
TPARSERSTATEACTION(TPS_InHTMLEntityNum), TPARSERSTATEACTION(TPS_InXMLEntityNum),
TPARSERSTATEACTION(TPS_InHTMLEntityEnd), TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
TPARSERSTATEACTION(TPS_InXMLEntityEnd),
TPARSERSTATEACTION(TPS_InTagFirst), TPARSERSTATEACTION(TPS_InTagFirst),
TPARSERSTATEACTION(TPS_InXMLBegin), TPARSERSTATEACTION(TPS_InXMLBegin),
TPARSERSTATEACTION(TPS_InTagCloseFirst), TPARSERSTATEACTION(TPS_InTagCloseFirst),
...@@ -1556,9 +1581,9 @@ prsd_end(PG_FUNCTION_ARGS) ...@@ -1556,9 +1581,9 @@ prsd_end(PG_FUNCTION_ARGS)
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE ) #define ENDPUNCTOKEN(x) ( (x)==SPACE )
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY ) #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) ) #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
...@@ -1839,7 +1864,7 @@ prsd_headline(PG_FUNCTION_ARGS) ...@@ -1839,7 +1864,7 @@ prsd_headline(PG_FUNCTION_ARGS)
} }
else else
{ {
if (HTMLHLIDIGNORE(prs->words[i].type)) if (XMLHLIDIGNORE(prs->words[i].type))
prs->words[i].replace = 1; prs->words[i].replace = 1;
} }
......
...@@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default'); ...@@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default');
10 | hword_part | Hyphenated word part, all letters 10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII 11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML tag 13 | tag | XML tag
14 | protocol | Protocol head 14 | protocol | Protocol head
15 | numhword | Hyphenated word, letters and digits 15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII 16 | asciihword | Hyphenated word, all ASCII
...@@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default'); ...@@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default');
20 | float | Decimal notation 20 | float | Decimal notation
21 | int | Signed integer 21 | int | Signed integer
22 | uint | Unsigned integer 22 | uint | Unsigned integer
23 | entity | HTML entity 23 | entity | XML entity
(23 rows) (23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment