Commit 2860041b authored by Bruce Momjian

August 13, 2002

         Use parser of OpenFTS v0.33.

--
Teodor Sigaev
parent 12763562
...@@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access. ...@@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su). (oleg@sai.msu.su).
CHANGES:
August 13, 2002
Use parser of OpenFTS v0.33.
IMPORTANT NOTICE: IMPORTANT NOTICE:
This is a first step of our work on integration of OpenFTS This is a first step of our work on integration of OpenFTS
......
...@@ -2,28 +2,33 @@ ...@@ -2,28 +2,33 @@
#define __DEFLEX_H__ #define __DEFLEX_H__
/* rememder !!!! */ /* rememder !!!! */
#define LASTNUM 19 #define LASTNUM 23
#define LATWORD 1 #define LATWORD 1
#define NONLATINWORD 2 #define CYRWORD 2
#define UWORD 3 #define UWORD 3
#define EMAIL 4 #define EMAIL 4
#define FURL 5 #define FURL 5
#define HOST 6 #define HOST 6
#define FLOAT 7 #define SCIENTIFIC 7
#define FINT 8 #define VERSIONNUMBER 8
#define PARTWORD 9 #define PARTHYPHENWORD 9
#define NONLATINPARTWORD 10 #define CYRPARTHYPHENWORD 10
#define LATPARTWORD 11 #define LATPARTHYPHENWORD 11
#define SPACE 12 #define SPACE 12
#define SYMTAG 13 #define TAG 13
#define HTTP 14 #define HTTP 14
#define DEFISWORD 15 #define HYPHENWORD 15
#define DEFISLATWORD 16 #define LATHYPHENWORD 16
#define DEFISNONLATINWORD 17 #define CYRHYPHENWORD 17
#define URI 18 #define URI 18
#define FILEPATH 19 #define FILEPATH 19
#define DECIMAL 20
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define HTMLENTITY 23
extern const char *descr[]; extern const char *descr[];
#endif #endif
...@@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)'; ...@@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty'); <i <b> wow < jqw <> qwerty');
txt2txtidx txt2txtidx

'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32' 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
(1 row) (1 row)
select txtidxsize(txt2txtidx('345 qw')); select txtidxsize(txt2txtidx('345 qw'));
...@@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e ...@@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e
<i <b> wow < jqw <> qwerty')); <i <b> wow < jqw <> qwerty'));
txtidxsize txtidxsize
------------ ------------
52 53
(1 row) (1 row)
insert into test_txtidx (a) values ('345 qwerty'); insert into test_txtidx (a) values ('345 qwerty');
......
...@@ -75,19 +75,23 @@ static MAPDICT mapdict[] = { ...@@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* EMAIL */ {NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */ {NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */ {NODICT, NODICT}, /* HOST */
{NODICT, NODICT}, /* FLOAT */ {NODICT, NODICT}, /* SCIENTIFIC */
{NODICT, NODICT}, /* FINT */ {NODICT, NODICT}, /* VERSIONNUMBER */
{BYLOCALE, DEFAULTDICT}, /* PARTWORD */ {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
{BYLOCALE, NODICT}, /* NONLATINPARTWORD */ {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
{DEFAULTDICT, NODICT}, /* LATPARTWORD */ {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
{STOPLEXEM, NODICT}, /* SPACE */ {STOPLEXEM, NODICT}, /* SPACE */
{STOPLEXEM, NODICT}, /* SYMTAG */ {STOPLEXEM, NODICT}, /* TAG */
{STOPLEXEM, NODICT}, /* HTTP */ {STOPLEXEM, NODICT}, /* HTTP */
{BYLOCALE, DEFAULTDICT}, /* DEFISWORD */ {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
{DEFAULTDICT, NODICT}, /* DEFISLATWORD */ {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
{BYLOCALE, NODICT}, /* DEFISNONLATINWORD */ {BYLOCALE, NODICT}, /* CYRHYPHENWORD */
{NODICT, NODICT}, /* URI */ {NODICT, NODICT}, /* URI */
{NODICT, NODICT} /* FILEPATH */ {NODICT, NODICT}, /* FILEPATH */
{NODICT, NODICT}, /* DECIMAL */
{NODICT, NODICT}, /* SIGNEDINT */
{NODICT, NODICT}, /* UNSIGNEDINT */
{STOPLEXEM, NODICT} /* HTMLENTITY */
}; };
static bool inited = false; static bool inited = false;
......
...@@ -5,18 +5,17 @@ ...@@ -5,18 +5,17 @@
/* postgres allocation function */ /* postgres allocation function */
#include "postgres.h" #include "postgres.h"
#define free pfree #define free pfree
#define malloc palloc #define malloc palloc
#define realloc repalloc #define realloc repalloc
#ifdef strdup #ifdef strdup
#undef strdup #undef strdup
#endif #endif
#define strdup pstrdup #define strdup pstrdup
char *token = NULL; /* pointer to token */ char *token = NULL; /* pointer to token */
char *s = NULL; /* for returning full defis-word */ char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
...@@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */ ...@@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */
%option nounput %option nounput
%option noyywrap %option noyywrap
/* parser's state for parsing hyphenated-word */
/* parser's state for parsing defis-word */
%x DELIM %x DELIM
/* parser's state for parsing URL*/ /* parser's state for parsing URL*/
%x URL %x URL
%x SERVER %x SERVER
/* parser's state for parsing filepath */ /* parser's state for parsing TAGS */
%x INTAG %x INTAG
%x QINTAG %x QINTAG
%x INCOMMENT
%x INSCRIPT
/* NONLATIN char */ /* cyrillic koi8 char */
NONLATINALNUM [0-9\200-\377] CYRALNUM [0-9\200-\377]
NONLATINALPHA [\200-\377] CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377] ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377] ALNUM [0-9a-zA-Z\200-\377]
...@@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+ ...@@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%% %%
"<"[[:alpha:]] { BEGIN INTAG; "<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"</"[[:alpha:]] { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<>" { <INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return SYMTAG; return SPACE;
} }
"<"[^>[:alpha:]] { "<!--" { BEGIN INCOMMENT; }
<INCOMMENT>"-->" {
BEGIN INITIAL;
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return SPACE; return SPACE;
} }
<INTAG>"\"" { BEGIN QINTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>"\\\"" { "<"[\![:alpha:]] { BEGIN INTAG; }
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>"\"" { BEGIN INTAG; "</"[[:alpha:]] { BEGIN INTAG; }
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>.|\n { <INTAG>"\"" { BEGIN QINTAG; }
<QINTAG>"\\\"" ;
<QINTAG>"\"" { BEGIN INTAG; }
<INTAG>">" {
BEGIN INITIAL;
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; *tsearch_yytext=' ';
return SYMTAG; token = tsearch_yytext;
tokenlen = 1;
return TAG;
} }
<INTAG>">" { BEGIN INITIAL; <QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return SYMTAG; return HTMLENTITY;
} }
<INTAG>.|\n { \&\#[0-9][0-9]?[0-9]?\; {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return SYMTAG; return HTMLENTITY;
} }
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ { [-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch_yytext; token = tsearch_yytext;
...@@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+ ...@@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+
return EMAIL; return EMAIL;
} }
<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ { [+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return FINT; return SCIENTIFIC;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return VERSIONNUMBER;
}
[+-]?[0-9]+\.[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return DECIMAL;
} }
<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ { [+-][0-9]+ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return FINT; return SIGNEDINT;
} }
[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ { <DELIM,INITIAL>[0-9]+ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return FLOAT; return UNSIGNEDINT;
} }
http"://" { http"://" {
...@@ -208,52 +212,58 @@ ftp"://" { ...@@ -208,52 +212,58 @@ ftp"://" {
return FILEPATH; return FILEPATH;
} }
({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ { ({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM; BEGIN DELIM;
if (s) { free(s); s=NULL; } if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext ); s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
yyless( 0 ); yyless( 0 );
token = s; token = s;
return DEFISNONLATINWORD; return CYRHYPHENWORD;
} }
([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ { ([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM; BEGIN DELIM;
if (s) { free(s); s=NULL; } if (s) { free(s); s=NULL; }
tokenlen = tsearch_yyleng;
s = strdup( tsearch_yytext ); s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 ); yyless( 0 );
token = s; token = s;
return DEFISLATWORD; return LATHYPHENWORD;
} }
({ALNUM}+-)+{ALPHA}+ /* composite-word */ { ({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM; BEGIN DELIM;
if (s) { free(s); s=NULL; } if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext ); s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
yyless( 0 ); yyless( 0 );
token = s; token = s;
return DEFISWORD; return HYPHENWORD;
}
<DELIM>\+?[0-9]+\.[0-9]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return DECIMAL;
} }
<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ { <DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return NONLATINPARTWORD; return CYRPARTHYPHENWORD;
} }
<DELIM>[[:alnum:]]+ /* one word in composite-word */ { <DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return LATPARTWORD; return LATPARTHYPHENWORD;
} }
<DELIM>{ALNUM}+ /* one word in composite-word */ { <DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return PARTWORD; return PARTHYPHENWORD;
} }
<DELIM>- { <DELIM>- {
...@@ -264,17 +274,16 @@ ftp"://" { ...@@ -264,17 +274,16 @@ ftp"://" {
<DELIM,SERVER,URL>.|\n /* return in basic state */ { <DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL; BEGIN INITIAL;
tokenlen = tsearch_yyleng;
yyless( 0 ); yyless( 0 );
} }
{NONLATINALNUM}+ /* normal word */ { {CYRALPHA}+ /* normal word */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return NONLATINWORD; return CYRWORD;
} }
[[:alnum:]]+ /* normal word */ { [[:alpha:]]+ /* normal word */ {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return LATWORD; return LATWORD;
...@@ -286,7 +295,13 @@ ftp"://" { ...@@ -286,7 +295,13 @@ ftp"://" {
return UWORD; return UWORD;
} }
.|\n { [ \r\n\t]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
. {
token = tsearch_yytext; token = tsearch_yytext;
tokenlen = tsearch_yyleng; tokenlen = tsearch_yyleng;
return SPACE; return SPACE;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment