Commit c52795d1 authored by Teodor Sigaev's avatar Teodor Sigaev

Text parser rewritten:

        - supports multibyte encodings
        - more strict rules for lexemes
        - flex isn't used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operations for tsquery, for combining
          a tsquery from its parts:  'foo & bar' || 'asd' => 'foo & bar | asd'
parent b91e6ed9
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
MODULE_big = tsearch2 MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
...@@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ ...@@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
wparser.o wparser_def.o \ wparser.o wparser_def.o \
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \ ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
tsvector_op.o rank.o ts_stat.o \ tsvector_op.o rank.o ts_stat.o \
query_util.o query_support.o query_rewrite.o query_gist.o query_util.o query_support.o query_rewrite.o query_gist.o \
ts_locale.o
SUBDIRS := snowball ispell wordparser SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o) SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
......
...@@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell ...@@ -13,12 +13,12 @@ psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:544: NOTICE: type "gtsvector" is not yet defined psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:549: NOTICE: argument type gtsvector is only a shell psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:998: NOTICE: type "gtsq" is not yet defined psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1003: NOTICE: argument type gtsq is only a shell psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell
--tsvector --tsvector
SELECT '1'::tsvector; SELECT '1'::tsvector;
tsvector tsvector
...@@ -653,7 +653,7 @@ select * from token_type('default'); ...@@ -653,7 +653,7 @@ select * from token_type('default');
11 | lpart_hword | Latin part of hyphenated word 11 | lpart_hword | Latin part of hyphenated word
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML Tag 13 | tag | HTML Tag
14 | http | HTTP head 14 | protocol | Protocol head
15 | hword | Hyphenated word 15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word 16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word 17 | nlhword | Non-latin hyphenated word
...@@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc ...@@ -672,14 +672,13 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
-------+-------------------------------------- -------+--------------------------------------
22 | 345 22 | 345
12 | 12 |
4 | qwe@efd.r 1 | qwe
12 | 12 | @
12 | ' 19 | efd.r
12 | 12 | '
14 | http:// 14 | http://
6 | www.com 6 | www.com
12 | / 12 | /
12 |
14 | http:// 14 | http://
5 | aew.werc.ewr/?ad=qwe&dw 5 | aew.werc.ewr/?ad=qwe&dw
6 | aew.werc.ewr 6 | aew.werc.ewr
...@@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc ...@@ -700,10 +699,8 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
6 | 4aew.werc.ewr 6 | 4aew.werc.ewr
12 | 12 |
14 | http:// 14 | http://
5 | 5aew.werc.ewr:8100/? 6 | 5aew.werc.ewr:8100
6 | 5aew.werc.ewr 12 | /?
18 | :8100/?
12 |
1 | ad 1 | ad
12 | = 12 | =
1 | qwe 1 | qwe
...@@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc ...@@ -711,12 +708,12 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
1 | dw 1 | dw
12 | 12 |
5 | 6aew.werc.ewr:8100/?ad=qwe&dw 5 | 6aew.werc.ewr:8100/?ad=qwe&dw
6 | 6aew.werc.ewr 6 | 6aew.werc.ewr:8100
18 | :8100/?ad=qwe&dw 18 | /?ad=qwe&dw
12 | 12 |
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
6 | 7aew.werc.ewr 6 | 7aew.werc.ewr:8100
18 | :8100/?ad=qwe&dw=%20%32 18 | /?ad=qwe&dw=%20%32
12 | 12 |
7 | +4.0e-10 7 | +4.0e-10
12 | 12 |
...@@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc ...@@ -747,11 +744,15 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
1 | jf 1 | jf
12 | 12 |
1 | sdjk 1 | sdjk
13 | <we hjwer <werrwe> 12 | <
1 | we
12 | 12 |
3 | ewr1 1 | hjwer
12 | > 12 |
13 | <werrwe>
12 | 12 |
3 | ewr1
12 | >
3 | ewri2 3 | ewri2
12 | 12 |
13 | <a href="qwe<qwe>"> 13 | <a href="qwe<qwe>">
...@@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc ...@@ -767,57 +768,53 @@ select * from parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc
12 | 12 |
19 | /wqe-324/ewr 19 | /wqe-324/ewr
12 | 12 |
6 | gist.h 19 | gist.h
12 |
6 | gist.h.c
12 | 12 |
6 | gist.c 19 | gist.h.c
12 | .
12 | 12 |
19 | gist.c
12 | .
1 | readline 1 | readline
12 | 12 |
20 | 4.2 20 | 4.2
12 | 12 |
20 | 4.2 20 | 4.2
12 | . 12 | .
12 |
20 | 4.2 20 | 4.2
12 | , 12 | ,
12 | 15 | readline-4.2
15 | readline-4
11 | readline 11 | readline
12 | - 12 | -
20 | 4.2 20 | 4.2
12 | 12 |
15 | readline-4 15 | readline-4.2
11 | readline 11 | readline
12 | - 12 | -
20 | 4.2 20 | 4.2
12 | . 12 | .
12 |
22 | 234 22 | 234
12 | 12 |
13 | <i <b> 12 | <
1 | i
12 |
13 | <b>
12 | 12 |
1 | wow 1 | wow
12 | 12 |
12 | < 12 | <
12 |
1 | jqw 1 | jqw
12 | 12 |
12 | < 12 | <>
12 | >
12 |
1 | qwerty 1 | qwerty
(138 rows) (135 rows)
SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> SELECT to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty'); <i <b> wow < jqw <> qwerty');
to_tsvector to_tsvector

'ad':18 'dw':20 'jf':40 '234':62 '345':1 '4.2':53,54,55,58,61 '455':32 'jqw':64 'qwe':19,28,29,36 'wer':37 'wow':63 'asdf':38 'ewr1':42 'qwer':39 'sdjk':41 '5.005':33 'ewri2':43 'qwqwe':30 'wefjn':47 'gist.c':51 'gist.h':49 'qwerti':65 '234.435':31 ':8100/?':17 'qwe-wer':35 'readlin':52,57,60 'www.com':3 '+4.0e-10':27 'gist.h.c':50 'rewt/ewr':46 'qwe@efd.r':2 'readline-4':56,59 '/?ad=qwe&dw':6,9,13 '/wqe-324/ewr':48 'aew.werc.ewr':5 '1aew.werc.ewr':8 '2aew.werc.ewr':10 '3aew.werc.ewr':12 '4aew.werc.ewr':14 '5aew.werc.ewr':16 '6aew.werc.ewr':22 '7aew.werc.ewr':25 '/usr/local/fff':44 '/awdf/dwqe/4325':45 ':8100/?ad=qwe&dw':23 'teodor@stack.net':34 '5aew.werc.ewr:8100/?':15 ':8100/?ad=qwe&dw=%20%32':26 'aew.werc.ewr/?ad=qwe&dw':4 '1aew.werc.ewr/?ad=qwe&dw':7 '3aew.werc.ewr/?ad=qwe&dw':11 '6aew.werc.ewr:8100/?ad=qwe&dw':21 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':24 'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
(1 row) (1 row)
SELECT length(to_tsvector('default', '345 qw')); SELECT length(to_tsvector('default', '345 qw'));
...@@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae ...@@ -831,7 +828,7 @@ SELECT length(to_tsvector('default', '345 qwe@efd.r '' http://www.com/ http://ae
<i <b> wow < jqw <> qwerty')); <i <b> wow < jqw <> qwerty'));
length length
-------- --------
53 51
(1 row) (1 row)
select to_tsquery('default', 'qwe & sKies '); select to_tsquery('default', 'qwe & sKies ');
...@@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj'); ...@@ -876,6 +873,36 @@ select to_tsquery('default', '(the|and&(i&1))&fghj');
'1' & 'fghj' '1' & 'fghj'
(1 row) (1 row)
select plainto_tsquery('default', 'the and z 1))& fghj');
plainto_tsquery
--------------------
'z' & '1' & 'fghj'
(1 row)
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
?column?
-----------------------
'foo' & 'bar' & 'asd'
(1 row)
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
?column?
------------------------------
'foo' & 'bar' | 'asd' & 'fg'
(1 row)
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
?column?
-----------------------------------
'foo' & 'bar' | !( 'asd' & 'fg' )
(1 row)
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
?column?
----------------------------------
'foo' & 'bar' & ( 'asd' | 'fg' )
(1 row)
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
?column? ?column?
---------- ----------
......
...@@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS); ...@@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(to_tsquery_current); PG_FUNCTION_INFO_V1(to_tsquery_current);
Datum to_tsquery_current(PG_FUNCTION_ARGS); Datum to_tsquery_current(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery);
Datum plainto_tsquery(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_name);
Datum plainto_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_current);
Datum plainto_tsquery_current(PG_FUNCTION_ARGS);
/* parser's states */ /* parser's states */
#define WAITOPERAND 1 #define WAITOPERAND 1
#define WAITOPERATOR 2 #define WAITOPERATOR 2
#define WAITFIRSTOPERAND 3 #define WAITFIRSTOPERAND 3
#define WAITSINGLEOPERAND 4
/* /*
* node of query tree, also used * node of query tree, also used
...@@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2 ...@@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
else if (*(state->buf) != ' ') else if (*(state->buf) != ' ')
return ERR; return ERR;
break; break;
case WAITSINGLEOPERAND:
if ( *(state->buf) == '\0' )
return END;
*strval = state->buf;
*lenval = strlen( state->buf );
state->buf += strlen( state->buf );
state->count++;
return VAL;
default: default:
return ERR; return ERR;
break; break;
...@@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos) ...@@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
* input * input
*/ */
static QUERYTYPE * static QUERYTYPE *
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id) queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
{ {
QPRS_STATE state; QPRS_STATE state;
int4 i; int4 i;
...@@ -599,7 +617,7 @@ static QUERYTYPE * ...@@ -599,7 +617,7 @@ static QUERYTYPE *
/* init state */ /* init state */
state.buf = buf; state.buf = buf;
state.state = WAITFIRSTOPERAND; state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
state.count = 0; state.count = 0;
state.num = 0; state.num = 0;
state.str = NULL; state.str = NULL;
...@@ -679,7 +697,7 @@ Datum ...@@ -679,7 +697,7 @@ Datum
tsquery_in(PG_FUNCTION_ARGS) tsquery_in(PG_FUNCTION_ARGS)
{ {
SET_FUNCOID(); SET_FUNCOID();
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0)); PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
} }
/* /*
...@@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS) ...@@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
str = text2char(in); str = text2char(in);
PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(in, 1);
query = queryin(str, pushval_morph, PG_GETARG_INT32(0)); query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
if ( query->size == 0 ) if ( query->size == 0 )
PG_RETURN_POINTER(query); PG_RETURN_POINTER(query);
...@@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS) ...@@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
Int32GetDatum(get_currcfg()), Int32GetDatum(get_currcfg()),
PG_GETARG_DATUM(0))); PG_GETARG_DATUM(0)));
} }
/*
 * plainto_tsquery(cfg_id, text)
 *
 * Argument 0 is the configuration id passed on to queryin(); argument 1 is
 * the text to convert.  The text is handed to queryin() with isplain = true
 * and the resulting item array is then filtered through clean_fakeval_v2().
 */
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
	text	   *txt = PG_GETARG_TEXT_P(1);
	char	   *cstr;
	QUERYTYPE  *result;
	ITEM	   *cleaned;
	int4		nitems;

	SET_FUNCOID();

	cstr = text2char(txt);
	PG_FREE_IF_COPY(txt, 1);

	result = queryin(cstr, pushval_morph, PG_GETARG_INT32(0), true);

	if (result->size != 0)
	{
		cleaned = clean_fakeval_v2(GETQUERY(result), &nitems);
		if (cleaned)
		{
			/* overwrite the item array with the cleaned copy */
			memcpy((void *) GETQUERY(result), (void *) cleaned, nitems * sizeof(ITEM));
			pfree(cleaned);
		}
		else
		{
			/* clean_fakeval_v2() returned nothing: hand back an empty query */
			result->len = HDRSIZEQT;
			result->size = 0;
		}
	}

	PG_RETURN_POINTER(result);
}
/*
 * plainto_tsquery(cfg_name, text)
 *
 * Translates the configuration name (argument 0) to an id with
 * name2id_cfg() and forwards it, together with argument 1, to
 * plainto_tsquery().
 */
Datum
plainto_tsquery_name(PG_FUNCTION_ARGS)
{
	text	   *cfgname = PG_GETARG_TEXT_P(0);
	Datum		cfgid;
	Datum		result;

	SET_FUNCOID();

	cfgid = Int32GetDatum(name2id_cfg(cfgname));
	result = DirectFunctionCall2(plainto_tsquery, cfgid, PG_GETARG_DATUM(1));

	PG_FREE_IF_COPY(cfgname, 0);
	PG_RETURN_DATUM(result);
}
/*
 * plainto_tsquery(text)
 *
 * Same as plainto_tsquery(cfg_id, text), using the id returned by
 * get_currcfg() as the configuration.
 */
Datum
plainto_tsquery_current(PG_FUNCTION_ARGS)
{
	Datum		result;

	SET_FUNCOID();

	result = DirectFunctionCall2(plainto_tsquery,
								 Int32GetDatum(get_currcfg()),
								 PG_GETARG_DATUM(0));

	PG_RETURN_DATUM(result);
}
...@@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) { ...@@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
PG_RETURN_INT32(nnode); PG_RETURN_INT32(nnode);
} }
/*
 * Build a two-child operator node over the trees of queries a and b.
 *
 * The operator character (valnode->val) is left zeroed; callers set it.
 * Note that b's tree is stored as child[0] and a's tree as child[1].
 */
static QTNode *
join_tsqueries(QUERYTYPE *a, QUERYTYPE *b)
{
	QTNode	   *node = (QTNode *) palloc0(sizeof(QTNode));
	ITEM	   *op = (ITEM *) palloc0(sizeof(ITEM));
	QTNode	  **kids = (QTNode **) palloc0(sizeof(QTNode *) * 2);

	op->type = OPR;

	kids[0] = QT2QTN(GETQUERY(b), GETOPERAND(b));
	kids[1] = QT2QTN(GETQUERY(a), GETOPERAND(a));

	node->flags |= QTN_NEEDFREE;
	node->valnode = op;
	node->child = kids;
	node->nchild = 2;

	return node;
}
PG_FUNCTION_INFO_V1(tsquery_and);
Datum		tsquery_and(PG_FUNCTION_ARGS);

/*
 * tsquery && tsquery
 *
 * Joins the two argument queries under an '&' operator node.  If either
 * argument has size 0 the other one is returned as is.
 */
Datum
tsquery_and(PG_FUNCTION_ARGS)
{
	QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
	QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
	QTNode	   *res;
	QUERYTYPE  *query;

	if (a->size == 0)
	{
		/* a was fetched from argument 0, so compare against that slot */
		PG_FREE_IF_COPY(a, 0);
		PG_RETURN_POINTER(b);
	}
	else if (b->size == 0)
	{
		PG_FREE_IF_COPY(b, 1);
		PG_RETURN_POINTER(a);
	}

	res = join_tsqueries(a, b);
	res->valnode->val = '&';

	query = QTN2QT(res, PlainMemory);

	QTNFree(res);
	PG_FREE_IF_COPY(a, 0);
	PG_FREE_IF_COPY(b, 1);

	PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_or);
Datum		tsquery_or(PG_FUNCTION_ARGS);

/*
 * tsquery || tsquery
 *
 * Joins the two argument queries under an '|' operator node.  If either
 * argument has size 0 the other one is returned as is.
 */
Datum
tsquery_or(PG_FUNCTION_ARGS)
{
	QUERYTYPE  *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
	QUERYTYPE  *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
	QTNode	   *res;
	QUERYTYPE  *query;

	if (a->size == 0)
	{
		/* a was fetched from argument 0, so compare against that slot */
		PG_FREE_IF_COPY(a, 0);
		PG_RETURN_POINTER(b);
	}
	else if (b->size == 0)
	{
		PG_FREE_IF_COPY(b, 1);
		PG_RETURN_POINTER(a);
	}

	res = join_tsqueries(a, b);
	res->valnode->val = '|';

	query = QTN2QT(res, PlainMemory);

	QTNFree(res);
	PG_FREE_IF_COPY(a, 0);
	PG_FREE_IF_COPY(b, 1);

	PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_not);
Datum tsquery_not(PG_FUNCTION_ARGS);
Datum
tsquery_not(PG_FUNCTION_ARGS) {
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
QTNode *res;
QUERYTYPE *query;
if ( a->size == 0 )
PG_RETURN_POINTER(a);
res=(QTNode*)palloc0( sizeof(QTNode) );
res->flags |= QTN_NEEDFREE;
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
res->valnode->type = OPR;
res->valnode->val = '!';
res->child = (QTNode**)palloc0( sizeof(QTNode*) );
res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
res->nchild = 1;
query = QTN2QT( res, PlainMemory );
QTNFree(res);
PG_FREE_IF_COPY(a,0);
PG_RETURN_POINTER(query);
}
static int static int
CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) { CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
if ( a->size != b->size ) { if ( a->size != b->size ) {
......
...@@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)'); ...@@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
select to_tsquery('default', '(asd&and)|fghj'); select to_tsquery('default', '(asd&and)|fghj');
select to_tsquery('default', '(asd&!and)|fghj'); select to_tsquery('default', '(asd&!and)|fghj');
select to_tsquery('default', '(the|and&(i&1))&fghj'); select to_tsquery('default', '(the|and&(i&1))&fghj');
select plainto_tsquery('default', 'the and z 1))& fghj');
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A'; select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
......
#include "ts_locale.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#if defined(TS_USE_WIDE) && defined(WIN32)
/*
 * wchar2char: WIN32 stand-in for wcstombs().
 *
 * to   - output buffer for the multibyte string
 * from - NUL-terminated wide string
 * len  - capacity of "to" in bytes (same meaning as wcstombs()'s third
 *        argument)
 *
 * When the database encoding is UTF-8 the conversion is done with
 * WideCharToMultiByte(); otherwise it falls through to wcstombs().
 * Returns the number of bytes produced, not counting the terminating NUL.
 * Raises ERROR if the UTF-16 to UTF-8 conversion fails.
 */
size_t
wchar2char( const char *to, const wchar_t *from, size_t len ) {
	/*
	 * NOTE(review): "to" is written through although the prototype declares
	 * it const; the cast keeps the published signature unchanged.
	 */
	char   *out = (char *) to;

	if (GetDatabaseEncoding() == PG_UTF8) {
		int r;

		if (len==0)
			return 0;

		/*
		 * Source length -1 means "convert up to and including the NUL";
		 * the destination capacity is len bytes.
		 */
		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, out, (int) len,
					NULL, NULL);

		if ( r==0 )
			ereport(ERROR,
				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
				 errmsg("UTF-16 to UTF-8 translation failed: %lu",
					GetLastError())));

		/* the API counts the terminating NUL; wcstombs() below does not */
		return r - 1;
	}

	return wcstombs(out, from, len);
}
/*
 * char2wchar: WIN32 stand-in for mbstowcs().
 *
 * to   - output buffer for the wide string
 * from - multibyte input
 * len  - used both as the number of input bytes and as the capacity of
 *        "to" in wide characters
 *
 * When the database encoding is UTF-8 the conversion is done with
 * MultiByteToWideChar(); otherwise it falls through to mbstowcs().
 * Raises ERROR if the UTF-8 input cannot be converted.
 */
size_t
char2wchar( const wchar_t *to, const char *from, size_t len ) {
	/*
	 * NOTE(review): "to" is written through although the prototype declares
	 * it const; the cast keeps the published signature unchanged.
	 */
	wchar_t *out = (wchar_t *) to;

	if (GetDatabaseEncoding() == PG_UTF8) {
		int r;

		if (len==0)
			return 0;

		r = MultiByteToWideChar(CP_UTF8, 0, from, (int) len,
					out, (int) len);

		if (!r) {
			/* let pg_verifymbstr() look at the input before the generic error */
			pg_verifymbstr(from, len, false);
			ereport(ERROR,
				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
				 errmsg("invalid multibyte character for locale"),
				 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
		}

		/* r is positive here; it must not exceed the output capacity */
		Assert((size_t) r <= len);

		return r;
	}

	return mbstowcs(out, from, len);
}
#endif
/*
 * ts_locale.h
 *
 * Decides whether wide-character support is available (TS_USE_WIDE) and
 * provides char2wchar()/wchar2char() for converting between multibyte and
 * wide strings.
 */
#ifndef __TSLOCALE_H__
#define __TSLOCALE_H__
#include "postgres.h"
#include <ctype.h>
#include <limits.h>
/*
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
* declare them in <wchar.h>.
*/
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
/* Wide-character code paths are enabled only when both probes succeeded. */
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
#define TS_USE_WIDE
#ifdef WIN32
/* On WIN32 these are real functions; "to" is the output buffer in both. */
size_t wchar2char( const char *to, const wchar_t *from, size_t len );
size_t char2wchar( const wchar_t *to, const char *from, size_t len );
#else /* WIN32 */
/* elsewhere the C library's mbstowcs()/wcstombs() are used directly */
#define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */
#endif /* __TSLOCALE_H__ */
...@@ -427,6 +427,21 @@ RETURNS tsquery ...@@ -427,6 +427,21 @@ RETURNS tsquery
AS 'MODULE_PATHNAME','to_tsquery_current' AS 'MODULE_PATHNAME','to_tsquery_current'
LANGUAGE 'c' with (isstrict,iscachable); LANGUAGE 'c' with (isstrict,iscachable);
-- plainto_tsquery: three overloads, all returning tsquery.
-- (oid, text): no link symbol is given, so the C function named
-- plainto_tsquery is used.
CREATE FUNCTION plainto_tsquery(oid, text)
RETURNS tsquery
AS 'MODULE_PATHNAME'
LANGUAGE 'c' with (isstrict,iscachable);
-- (text, text): bound to the C function plainto_tsquery_name.
CREATE FUNCTION plainto_tsquery(text, text)
RETURNS tsquery
AS 'MODULE_PATHNAME','plainto_tsquery_name'
LANGUAGE 'c' with (isstrict,iscachable);
-- (text): bound to the C function plainto_tsquery_current.
CREATE FUNCTION plainto_tsquery(text)
RETURNS tsquery
AS 'MODULE_PATHNAME','plainto_tsquery_current'
LANGUAGE 'c' with (isstrict,iscachable);
--operations --operations
CREATE FUNCTION exectsq(tsvector, tsquery) CREATE FUNCTION exectsq(tsvector, tsquery)
RETURNS bool RETURNS bool
...@@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery) ...@@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
language 'C' language 'C'
with (isstrict,iscachable); with (isstrict,iscachable);
-- tsquery && tsquery: bound to the C function tsquery_and.
CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery)
        returns tsquery
        as 'MODULE_PATHNAME', 'tsquery_and'
        language 'C'
        with (isstrict,iscachable);

-- The operator returns tsquery, not boolean, so no RESTRICT / JOIN
-- selectivity estimators are attached: those clauses are only meaningful
-- for operators that return boolean.
CREATE OPERATOR && (
        LEFTARG = tsquery,
        RIGHTARG = tsquery,
        PROCEDURE = tsquery_and,
        COMMUTATOR = '&&'
);
-- tsquery || tsquery: bound to the C function tsquery_or.
CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery)
        returns tsquery
        as 'MODULE_PATHNAME', 'tsquery_or'
        language 'C'
        with (isstrict,iscachable);

-- The operator returns tsquery, not boolean, so no RESTRICT / JOIN
-- selectivity estimators are attached: those clauses are only meaningful
-- for operators that return boolean.
CREATE OPERATOR || (
        LEFTARG = tsquery,
        RIGHTARG = tsquery,
        PROCEDURE = tsquery_or,
        COMMUTATOR = '||'
);
-- !! tsquery: bound to the C function tsquery_not.
CREATE OR REPLACE FUNCTION tsquery_not(tsquery)
returns tsquery
as 'MODULE_PATHNAME', 'tsquery_not'
language 'C'
with (isstrict,iscachable);
-- Only RIGHTARG is given, i.e. this is a prefix operator.
CREATE OPERATOR !! (
RIGHTARG = tsquery,
PROCEDURE = tsquery_not
);
--------------rewrite subsystem --------------rewrite subsystem
CREATE OR REPLACE FUNCTION rewrite(tsquery, text) CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
......
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
SUBOBJS = parser.o deflex.o SUBOBJS = parser.o deflex.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
PG_CPPFLAGS = -I$(srcdir)/.. PG_CPPFLAGS = -I$(srcdir)/..
...@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL) ...@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
all: SUBSYS.o all: SUBSYS.o
parser.c: parser.l
ifdef FLEX
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
else
@$(missing) flex $< $@
endif
SUBSYS.o: $(SUBOBJS) SUBSYS.o: $(SUBOBJS)
$(LD) $(LDREL) $(LDOUT) $@ $^ $(LD) $(LDREL) $(LDOUT) $@ $^
......
...@@ -15,7 +15,7 @@ const char *lex_descr[] = { ...@@ -15,7 +15,7 @@ const char *lex_descr[] = {
"Latin part of hyphenated word", "Latin part of hyphenated word",
"Space symbols", "Space symbols",
"HTML Tag", "HTML Tag",
"HTTP head", "Protocol head",
"Hyphenated word", "Hyphenated word",
"Latin hyphenated word", "Latin hyphenated word",
"Non-latin hyphenated word", "Non-latin hyphenated word",
...@@ -42,7 +42,7 @@ const char *tok_alias[] = { ...@@ -42,7 +42,7 @@ const char *tok_alias[] = {
"lpart_hword", "lpart_hword",
"blank", "blank",
"tag", "tag",
"http", "protocol",
"hword", "hword",
"lhword", "lhword",
"nlhword", "nlhword",
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#define LATPARTHYPHENWORD 11 #define LATPARTHYPHENWORD 11
#define SPACE 12 #define SPACE 12
#define TAG 13 #define TAG 13
#define HTTP 14 #define PROTOCOL 14
#define HYPHENWORD 15 #define HYPHENWORD 15
#define LATHYPHENWORD 16 #define LATHYPHENWORD 16
#define CYRHYPHENWORD 17 #define CYRHYPHENWORD 17
......
#include "postgres.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#include "deflex.h"
#include "parser.h"
#include "ts_locale.h"
/*
 * Allocate a new position record.  With a non-NULL prev the new record
 * starts as a byte copy of it, otherwise it starts zeroed.  In both cases
 * the record is linked back to prev and pushedAtAction is cleared.
 */
static TParserPosition*
newTParserPosition(TParserPosition *prev) {
	TParserPosition *pos = (TParserPosition*) palloc(sizeof(TParserPosition));

	if ( prev == NULL )
		memset(pos, 0, sizeof(TParserPosition));
	else
		memcpy(pos, prev, sizeof(TParserPosition));

	pos->pushedAtAction = NULL;
	pos->prev = prev;

	return pos;
}
/*
 * TParserInit: allocate a parser over the len bytes starting at str.
 *
 * The string is not copied; the parser keeps the caller's pointer.  When
 * wide-character support is compiled in and applicable, a wide copy of the
 * input is built as well.  The parser starts in state TPS_Base.
 *
 * Raises ERROR if the input cannot be converted to wide characters.
 */
TParser*
TParserInit( char *str, int len ) {
	TParser *prs = (TParser*)palloc0( sizeof(TParser) );

	prs->charmaxlen = pg_database_encoding_max_length();
	prs->str = str;
	prs->lenstr = len;

#ifdef TS_USE_WIDE
	/*
	 * Use wide char code only when max encoding length > 1 and ctype != C.
	 * Some operating systems fail with multi-byte encodings and a C locale.
	 * Also, for a C locale there is no need to process as multibyte.
	 * From backend/utils/adt/oracle_compat.c Teodor
	 */
	if ( prs->charmaxlen > 1 && !lc_ctype_is_c() ) {
		size_t	nwide;

		prs->usewide=true;
		prs->wstr = (wchar_t*)palloc( sizeof(wchar_t) * prs->lenstr );
		nwide = char2wchar( prs->wstr, prs->str, prs->lenstr );

		/*
		 * mbstowcs() reports an undecodable sequence as (size_t) -1; do not
		 * go on with a bogus length and a partially filled buffer.
		 */
		if ( nwide == (size_t) -1 )
			ereport(ERROR,
				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
				 errmsg("invalid multibyte character for locale"),
				 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));

		prs->lenwstr = nwide;
	} else
#endif
		prs->usewide=false;

	prs->state = newTParserPosition(NULL);
	prs->state->state = TPS_Base;

	return prs;
}
/*
 * Release a parser: every position record reachable through ->prev, the
 * wide-character buffer if one was allocated, and the parser itself.
 */
void
TParserClose( TParser* prs ) {
	TParserPosition *pos;
	TParserPosition *older;

	for ( pos = prs->state; pos != NULL; pos = older ) {
		older = pos->prev;
		pfree( pos );
	}

	if ( prs->wstr != NULL )
		pfree( prs->wstr );

	pfree( prs );
}
/*
 * Support functions equivalent to the is*() macros, but working with any
 * possible encoding and locale.
 *
 * p_iswhat(type) expands to two functions: p_is<type>(), which classifies
 * the character at the parser's current position, and p_isnot<type>(),
 * its negation.
 */
#ifdef TS_USE_WIDE

/* uses isw<type>() on the wide copy when prs->usewide is set */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}

/* p_iseq should be used only for ascii symbols */
static int
p_iseq(TParser *prs, char c) {
	Assert( prs->state );
	return ( ( prs->state->charlen==1 && *( prs->str + prs->state->posbyte ) == c ) ) ? 1 : 0;
}

#else /* TS_USE_WIDE */

/* byte-only variant: always classifies the byte at posbyte */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}

static int
p_iseq(TParser *prs, char c) {
	Assert( prs->state );
	return ( *( prs->str + prs->state->posbyte ) == c ) ? 1 : 0;
}

#endif /* TS_USE_WIDE */

p_iswhat(alnum)
p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)
/*
 * Returns 1 when the byte position has reached the end of the input or the
 * current character length is 0, otherwise 0.
 */
static int
p_isEOF(TParser *prs) {
	Assert( prs->state );

	if ( prs->state->posbyte == prs->lenstr )
		return 1;

	return ( prs->state->charlen == 0 ) ? 1 : 0;
}
/* Returns 1 when the current character equals prs->c, otherwise 0. */
static int
p_iseqC(TParser *prs) {
	int		same = p_iseq(prs, prs->c);

	return same;
}
/* Returns 1 when the current character differs from prs->c, otherwise 0. */
static int
p_isneC(TParser *prs) {
	if ( p_iseq(prs, prs->c) )
		return 0;

	return 1;
}
/*
 * Returns 1 when the current character is one byte long and isascii()
 * accepts that byte, otherwise 0.
 */
static int
p_isascii(TParser *prs) {
	if ( prs->state->charlen != 1 )
		return 0;

	return isascii( (unsigned char) prs->str[ prs->state->posbyte ] ) ? 1 : 0;
}
/* Returns 1 for an alphabetic character that is also ASCII, otherwise 0. */
static int
p_islatin(TParser *prs) {
	if ( !p_isalpha(prs) )
		return 0;

	return p_isascii(prs) ? 1 : 0;
}
/* Returns 1 for an alphabetic character that is not ASCII, otherwise 0. */
static int
p_isnonlatin(TParser *prs) {
	if ( !p_isalpha(prs) )
		return 0;

	return p_isascii(prs) ? 0 : 1;
}
/*
 * References every generated p_is*()/p_isnot*() predicate plus p_isEOF,
 * p_iseqC and p_isneC once.  Presumably this exists only to silence
 * "defined but not used" warnings for the static functions.
 *
 * NOTE(review): the predicates dereference their argument, so calling this
 * function with these NULL arguments would crash; it looks like it is never
 * meant to be executed - confirm.
 */
void _make_compiler_happy(void);
void
_make_compiler_happy(void) {
p_isalnum(NULL); p_isnotalnum(NULL);
p_isalpha(NULL); p_isnotalpha(NULL);
p_isdigit(NULL); p_isnotdigit(NULL);
p_islower(NULL); p_isnotlower(NULL);
p_isprint(NULL); p_isnotprint(NULL);
p_ispunct(NULL); p_isnotpunct(NULL);
p_isspace(NULL); p_isnotspace(NULL);
p_isupper(NULL); p_isnotupper(NULL);
p_isxdigit(NULL); p_isnotxdigit(NULL);
p_isEOF(NULL);
p_iseqC(NULL); p_isneC(NULL);
}
/*
 * Toggle prs->ignore according to the current lexeme: "<script" and
 * "<style" switch it on, "</script" and "</style" switch it off.  The
 * comparison is case-insensitive and only made when the lexeme's character
 * length matches the tag prefix.
 */
static void
SpecialTags(TParser *prs) {
	if ( prs->state->lencharlexeme == 6 ) {
		/* <style */
		if ( pg_strncasecmp( prs->lexeme, "<style", 6 ) == 0 )
			prs->ignore = true;
	} else if ( prs->state->lencharlexeme == 7 ) {
		/* <script || </style */
		if ( pg_strncasecmp( prs->lexeme, "</style", 7 ) == 0 )
			prs->ignore = false;
		else if ( pg_strncasecmp( prs->lexeme, "<script", 7 ) == 0 )
			prs->ignore = true;
	} else if ( prs->state->lencharlexeme == 8 ) {
		/* </script */
		if ( pg_strncasecmp( prs->lexeme, "</script", 8 ) == 0 )
			prs->ignore = false;
	}
}
/*
 * Set prs->wanthost and move the current position back by the byte and
 * character length of the current lexeme.
 */
static void
SpecialFURL(TParser *prs) {
	TParserPosition *pos = prs->state;

	prs->wanthost = true;
	pos->posbyte -= pos->lenbytelexeme;
	pos->poschar -= pos->lencharlexeme;
}
/*
 * Move the current position back by the byte and character length of the
 * current lexeme.
 */
static void
SpecialHyphen(TParser *prs) {
	TParserPosition *pos = prs->state;

	pos->posbyte -= pos->lenbytelexeme;
	pos->poschar -= pos->lencharlexeme;
}
/*
 * One-shot test of prs->wanthost: returns 1 and clears the flag when it is
 * set, returns 0 otherwise.
 */
static int
p_isstophost(TParser *prs) {
	if ( !prs->wanthost )
		return 0;

	prs->wanthost = false;
	return 1;
}
/* Returns 1 while prs->ignore is set, otherwise 0. */
static int
p_isignore(TParser *prs) {
	if ( prs->ignore )
		return 1;

	return 0;
}
/*
 * Run a temporary parser over the rest of the input, starting at the
 * current byte position.  If its first token has type HOST, advance this
 * parser's position and lexeme lengths by that token and return 1;
 * otherwise leave the state untouched and return 0.
 */
static int
p_ishost(TParser *prs) {
	TParserPosition *cur = prs->state;
	TParser *sub = TParserInit( prs->str + cur->posbyte, prs->lenstr - cur->posbyte );
	int matched = ( TParserGet(sub) && sub->type == HOST ) ? 1 : 0;

	if ( matched ) {
		cur->posbyte += sub->lenbytelexeme;
		cur->poschar += sub->lencharlexeme;
		cur->lenbytelexeme += sub->lenbytelexeme;
		cur->lencharlexeme += sub->lencharlexeme;
		cur->charlen = sub->state->charlen;
	}

	TParserClose(sub);

	return matched;
}
/*
 * Run a temporary parser over the rest of the input, pushed into state
 * TPS_InFileFirst.  If its first token has type URI or FILEPATH, advance
 * this parser's position and lexeme lengths by that token and return 1;
 * otherwise leave the state untouched and return 0.
 */
static int
p_isURI(TParser *prs) {
	TParserPosition *cur = prs->state;
	TParser *sub = TParserInit( prs->str + cur->posbyte, prs->lenstr - cur->posbyte );
	int matched = 0;

	/* start the temporary parser in the file/URI state on a new position */
	sub->state = newTParserPosition( sub->state );
	sub->state->state = TPS_InFileFirst;

	if ( TParserGet(sub) ) {
		if ( sub->type == URI || sub->type == FILEPATH ) {
			cur->posbyte += sub->lenbytelexeme;
			cur->poschar += sub->lencharlexeme;
			cur->lenbytelexeme += sub->lenbytelexeme;
			cur->lencharlexeme += sub->lencharlexeme;
			cur->charlen = sub->state->charlen;
			matched = 1;
		}
	}

	TParserClose(sub);

	return matched;
}
/*
* Table of state/action of parser
*
* Action flags used in the third column of the TParserStateActionItem
* tables.  A_NEXT is zero; the others are distinct bits, so they can be
* OR-ed together (e.g. A_NEXT|A_CLEAR).
* NOTE(review): what each flag makes the parser do is decided by code that
* is not part of this section - confirm there.
*/
#define A_NEXT 0x0000
#define A_BINGO 0x0001
#define A_POP 0x0002
#define A_PUSH 0x0004
#define A_RERUN 0x0008
#define A_CLEAR 0x0010
#define A_MERGE 0x0020
#define A_CLRALL 0x0040
/*
 * Action table named for TPS_Base.  Each row holds a predicate (NULL in the
 * last row), a character, A_* flags, a TPS_* state, a token type and a
 * trailing pointer.
 * NOTE(review): presumably rows are tried in order, the character is the
 * operand of p_iseqC and the NULL row is the fallback - confirm against
 * the table consumer.
 */
static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
};
/*
 * Action table named for TPS_InUWord.  The A_BINGO rows carry token type
 * UWORD; '@', '/' and '-' use A_PUSH with the e-mail, file and hyphenated
 * word states.
 * NOTE(review): presumably rows are tried in order and the NULL row is the
 * fallback - confirm against the table consumer.
 */
static TParserStateActionItem actionTPS_InUWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
};
/*
 * Action table named for TPS_InLatWord.  The A_BINGO rows carry token type
 * LATWORD.  Note that '.' and '-' each appear in two A_PUSH rows with
 * different target states.
 * NOTE(review): presumably rows are tried in order and a later row for the
 * same character is an alternative tried after the earlier one fails -
 * confirm against the table consumer.
 */
static TParserStateActionItem actionTPS_InLatWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst,0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL}
};
static TParserStateActionItem actionTPS_InCyrWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst,0, NULL},
{NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL}
};
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
/*
* Action tables for signed integers, whitespace, decimals, version numbers
* and scientific notation.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
* The "...First" states are entered through A_PUSH right after a sign, '.'
* or 'e'/'E': if the following character does not fit they A_POP back to
* the pushed position, otherwise the pushed position is dropped (A_CLEAR)
* and parsing continues in the longer form.
*/

/* Just after a leading '-' or '+': a digit must follow */
static TParserStateActionItem actionTPS_InSignedIntFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT|A_CLEAR, TPS_InSignedInt, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Digits of a signed integer; emitted as SIGNEDINT */
static TParserStateActionItem actionTPS_InSignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
};
/*
* Run of ignorable / non-alphanumeric characters; emitted as SPACE.
* The run is ended before '<', '-', '+', '&' and '/' so that the base state
* can try the lexemes that start with those characters.
*/
static TParserStateActionItem actionTPS_InSpace[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
{p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
{p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
{p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
};
/* Just after the '.' following an unsigned integer: a digit must follow */
static TParserStateActionItem actionTPS_InUDecimalFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Fraction digits of an unsigned decimal; emitted as DECIMAL */
static TParserStateActionItem actionTPS_InUDecimal[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
/* Just after the '.' following a signed integer: a digit must follow */
static TParserStateActionItem actionTPS_InDecimalFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Fraction digits of a signed decimal; emitted as DECIMAL (no version form) */
static TParserStateActionItem actionTPS_InDecimal[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
};
/* Just after a further '.' in a dotted number: a digit must follow */
static TParserStateActionItem actionTPS_InVersionFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Digits of a dotted number with two or more dots; emitted as VERSIONNUMBER */
static TParserStateActionItem actionTPS_InVersion[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
{p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
};
/* Just after 'e'/'E' in a number: expect a digit or a sign */
static TParserStateActionItem actionTPS_InMantissaFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
{p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after the sign that follows 'e'/'E': a digit must follow */
static TParserStateActionItem actionTPS_InMantissaSign[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Digits after 'e'/'E' (and optional sign); emitted as SCIENTIFIC */
static TParserStateActionItem actionTPS_InMantissa[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
{p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
};
/*
* Action tables for HTML entities: '&' followed by latin letters and ';',
* or "&#" followed by digits and ';'.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
* These states are reached via A_PUSH on '&'; any unexpected character
* pops back to the pushed position.
*/

/* Just after '&' */
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst,0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside the name of a named entity, waiting for ';' */
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after "&#": a digit must follow */
static TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside the digits of a numeric entity, waiting for ';' */
static TParserStateActionItem actionTPS_InHTMLEntityNum[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* After the closing ';': unconditionally emit HTMLENTITY and drop the pushed position */
static TParserStateActionItem actionTPS_InHTMLEntityEnd[] = {
{NULL, 0, A_BINGO|A_CLEAR,TPS_Base, HTMLENTITY, NULL}
};
/*
* Action tables for markup tags and "<!-- ... -->" comments; both are
* emitted as TAG.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
* These states are reached via A_PUSH on '<'; when the text turns out not
* to be a tag the states pop back to the pushed position.
*/

/* Just after '<' */
static TParserStateActionItem actionTPS_InTagFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InTag, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after "</": a latin letter must follow */
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* Inside a tag, outside of quotes.  The SpecialTags handler is invoked on
* '>' and on whitespace (its effect is defined elsewhere).
*/
static TParserStateActionItem actionTPS_InTag[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a single-quoted attribute value of a tag */
static TParserStateActionItem actionTPS_InTagEscapeK[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
{p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
};
/* Inside a double-quoted attribute value of a tag */
static TParserStateActionItem actionTPS_InTagEscapeKK[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
{NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
};
/*
* Just after a backslash inside a quoted value: consume the next character
* whatever it is and merge back into the quoted-value state that pushed.
*/
static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{NULL, 0, A_MERGE, TPS_Null, 0, NULL}
};
/* After the closing '>': unconditionally emit TAG and drop all pushed positions */
static TParserStateActionItem actionTPS_InTagEnd[] = {
{NULL, 0, A_BINGO|A_CLRALL,TPS_Base, TAG, NULL}
};
/* Just after "<!": expect the first '-' of "<!--" */
static TParserStateActionItem actionTPS_InCommentFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after "<!-": expect the second '-' of "<!--" */
static TParserStateActionItem actionTPS_InCommentLast[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside the comment body: skip everything until a '-' is seen */
static TParserStateActionItem actionTPS_InComment[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst,0, NULL},
{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
};
/* One '-' seen inside the comment body */
static TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
};
/* "--" seen inside the comment body: '>' ends the comment, further '-' stay here */
static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
};
/* After "-->": unconditionally emit TAG and drop all pushed positions */
static TParserStateActionItem actionTPS_InCommentEnd[] = {
{NULL, 0, A_BINGO|A_CLRALL,TPS_Base, TAG, NULL}
};
/*
* Action tables for host names (optionally with a port) and e-mail
* addresses.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
* ORDER MATTERS: entries with the same test are alternatives that are tried
* in turn after the path started by an earlier A_PUSH entry was popped.
*/

/* Just after a '.' inside a candidate host name */
static TParserStateActionItem actionTPS_InHostFirstDomen[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomenSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
//{p_iseqC, '-', A_POP, TPS_InHostFirstAN, 0, NULL},
//{p_iseqC, '.', A_POP, TPS_InHostFirstDomen, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* One latin letter seen after the '.': a second latin letter is needed
* before the text can end as a host (TPS_InHostDomen).
*/
static TParserStateActionItem actionTPS_InHostDomenSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* Inside a last label made of two or more latin letters: the text so far is
* a valid HOST.  Longer forms (more labels, port, e-mail, full URL) are
* tried first through A_PUSH.  The second p_isdigit entry is only reached
* after the pushed TPS_InHost attempt for a digit has failed.
*/
static TParserStateActionItem actionTPS_InHostDomen[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_Base, HOST, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
{p_isstophost, 0, A_BINGO|A_CLRALL,TPS_InURIStart, HOST, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_Base, HOST, NULL}
};
/* Just after the ':' following a host: a digit must follow */
static TParserStateActionItem actionTPS_InPortFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside the port digits; host plus port is emitted as HOST */
static TParserStateActionItem actionTPS_InPort[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_Base, HOST, NULL},
{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
{p_isstophost, 0, A_BINGO|A_CLRALL,TPS_InURIStart, HOST, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_Base, HOST, NULL}
};
/* Just after a '-' inside a candidate host name: a digit or latin letter must follow */
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* Inside a host label that is not (yet) a valid final label: the text
* cannot end here, so EOF and any other character pop back.
*/
static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* Just after '@': emit EMAIL if p_ishost accepts what follows, otherwise
* pop.  (p_ishost is defined elsewhere; presumably it checks that a host
* name follows and accounts for its length.)
*/
static TParserStateActionItem actionTPS_InEmail[] = {
{p_ishost, 0, A_BINGO|A_CLRALL, TPS_Base, EMAIL, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/*
* Action tables for file paths, URIs, full URLs and protocol prefixes.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
*/

/* First character of a path component (entered via A_PUSH, e.g. after '/') */
static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a file path; emitted as FILEPATH */
static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
};
/* Just after a '.' inside a file path: a letter, digit or '_' must follow */
static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after '?': a non-space, non-quote character must follow */
static TParserStateActionItem actionTPS_InURIFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
{p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
{p_isnotspace, 0, A_CLEAR, TPS_InURI, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL},
};
/* Entered right after a HOST was emitted: unconditionally continue as URI */
static TParserStateActionItem actionTPS_InURIStart[] = {
{NULL, 0, A_NEXT, TPS_InURI, 0, NULL}
};
/* Inside a URI: runs until whitespace, a quote or end of input; emitted as URI */
static TParserStateActionItem actionTPS_InURI[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, URI, NULL},
{p_iseqC, '"', A_BINGO, TPS_Base, URI, NULL},
{p_iseqC, '\'', A_BINGO, TPS_Base, URI, NULL},
{p_isnotspace, 0, A_NEXT, TPS_InURI, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, URI, NULL}
};
/*
* Just after the '/' following a host: emit FURL if p_isURI accepts what
* follows, invoking the SpecialFURL handler; otherwise pop.  (p_isURI and
* SpecialFURL are defined elsewhere.)
*/
static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isURI, 0, A_BINGO|A_CLRALL,TPS_Base, FURL, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after the ':' following a latin word: expect the first '/' of "://" */
static TParserStateActionItem actionTPS_InProtocolFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Just after ":/": expect the second '/' of "://" */
static TParserStateActionItem actionTPS_InProtocolSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* After "://": unconditionally emit PROTOCOL and drop all pushed positions */
static TParserStateActionItem actionTPS_InProtocolEnd[] = {
{NULL, 0, A_BINGO|A_CLRALL,TPS_Base, PROTOCOL, NULL}
};
/*
* Action tables that recognize a whole hyphenated word.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
* When the whole word is emitted (LATHYPHENWORD, CYRHYPHENWORD or
* HYPHENWORD) the SpecialHyphen handler is invoked and the parser switches
* to TPS_InParseHyphen, whose tables emit the individual parts.
* (SpecialHyphen is defined elsewhere.)
*/

/*
* Just after a '-' following latin letters.
* NOTE(review): the second p_isdigit entry follows an identical test that
* does not push, so it looks unreachable - confirm.
*/
static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a hyphenated word made only of latin letters so far */
static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst,0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}
};
/*
* Just after a '-' following non-latin letters.
* NOTE(review): the second p_isdigit entry follows an identical test that
* does not push, so it looks unreachable - confirm.
*/
static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a hyphenated word made only of non-latin letters so far */
static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst,0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
};
/* Just after a '-' following a mixed-class word */
static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a hyphenated word that mixes character classes */
static TParserStateActionItem actionTPS_InHyphenUWord[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst,0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
};
/* Just after a '.' inside a numeric part of a hyphenated word: a digit must follow */
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Inside a part of a hyphenated word that started with a digit */
static TParserStateActionItem actionTPS_InHyphenValue[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst,0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
};
/* Inside the digits after a '.' in a numeric part of a hyphenated word */
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
{p_isEOF, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{NULL, 0, A_BINGO|A_CLRALL,TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
};
/*
* Action tables that emit the individual parts of a hyphenated word.
* TPS_InParseHyphen is entered after the whole hyphenated word has been
* emitted by the whole-word states.
*
* Entry layout is {isclass, c, flags, tostate, type, special}; the first
* matching entry wins and the isclass == NULL entry is the default.
*/

/*
* Dispatch on the first character of the next part.  Anything that cannot
* start a part hands the same character back to TPS_Base (A_RERUN).
*/
static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart,0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart,0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt,0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen,0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
};
/* Just after a '-' between parts: emitted as SPACE when an alphanumeric follows */
static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isalnum, 0, A_BINGO|A_CLEAR,TPS_InParseHyphen, SPACE, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Part made of non-latin letters; emitted as CYRPARTHYPHENWORD */
static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD,NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart,0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD,NULL}
};
/* Part made of latin letters; emitted as LATPARTHYPHENWORD */
static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD,NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart,0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD,NULL}
};
/* Part mixing character classes; emitted as PARTHYPHENWORD */
static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL}
};
/* Part made of digits; emitted as UNSIGNEDINT unless it continues as a decimal */
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt,0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst,0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
/* Just after the '.' in a numeric part: a digit must follow */
static TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Fraction digits of a numeric part; emitted as DECIMAL */
static TParserStateActionItem actionTPS_InHDecimalPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst,0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
};
/* Just after a further '.' in a numeric part: a digit must follow */
static TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
/* Digits of a numeric part with two or more dots; emitted as VERSIONNUMBER */
static TParserStateActionItem actionTPS_InHVersionPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst,0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
};
/*
* Maps every parser state to its action table.
*
* TParserGet() indexes this array directly with the numeric value of the
* current state and asserts that Actions[state].state == state, so the
* order should be the same as in typedef enum {} TParserState!!
* The final { TPS_Null, NULL } entry corresponds to the fake last state.
*/
static const TParserStateAction Actions[] = {
{ TPS_Base, actionTPS_Base },
{ TPS_InUWord, actionTPS_InUWord },
{ TPS_InLatWord, actionTPS_InLatWord },
{ TPS_InCyrWord, actionTPS_InCyrWord },
{ TPS_InUnsignedInt, actionTPS_InUnsignedInt },
{ TPS_InSignedIntFirst, actionTPS_InSignedIntFirst },
{ TPS_InSignedInt, actionTPS_InSignedInt },
{ TPS_InSpace, actionTPS_InSpace },
{ TPS_InUDecimalFirst, actionTPS_InUDecimalFirst },
{ TPS_InUDecimal, actionTPS_InUDecimal },
{ TPS_InDecimalFirst, actionTPS_InDecimalFirst },
{ TPS_InDecimal, actionTPS_InDecimal },
{ TPS_InVersionFirst, actionTPS_InVersionFirst },
{ TPS_InVersion, actionTPS_InVersion },
{ TPS_InMantissaFirst, actionTPS_InMantissaFirst },
{ TPS_InMantissaSign, actionTPS_InMantissaSign },
{ TPS_InMantissa, actionTPS_InMantissa },
{ TPS_InHTMLEntityFirst, actionTPS_InHTMLEntityFirst },
{ TPS_InHTMLEntity, actionTPS_InHTMLEntity },
{ TPS_InHTMLEntityNumFirst, actionTPS_InHTMLEntityNumFirst },
{ TPS_InHTMLEntityNum, actionTPS_InHTMLEntityNum },
{ TPS_InHTMLEntityEnd, actionTPS_InHTMLEntityEnd },
{ TPS_InTagFirst, actionTPS_InTagFirst },
{ TPS_InTagCloseFirst, actionTPS_InTagCloseFirst },
{ TPS_InTag, actionTPS_InTag },
{ TPS_InTagEscapeK, actionTPS_InTagEscapeK },
{ TPS_InTagEscapeKK, actionTPS_InTagEscapeKK },
{ TPS_InTagBackSleshed, actionTPS_InTagBackSleshed },
{ TPS_InTagEnd, actionTPS_InTagEnd },
{ TPS_InCommentFirst, actionTPS_InCommentFirst },
{ TPS_InCommentLast, actionTPS_InCommentLast },
{ TPS_InComment, actionTPS_InComment },
{ TPS_InCloseCommentFirst, actionTPS_InCloseCommentFirst },
{ TPS_InCloseCommentLast, actionTPS_InCloseCommentLast },
{ TPS_InCommentEnd, actionTPS_InCommentEnd },
{ TPS_InHostFirstDomen, actionTPS_InHostFirstDomen },
{ TPS_InHostDomenSecond, actionTPS_InHostDomenSecond },
{ TPS_InHostDomen, actionTPS_InHostDomen },
{ TPS_InPortFirst, actionTPS_InPortFirst },
{ TPS_InPort, actionTPS_InPort },
{ TPS_InHostFirstAN, actionTPS_InHostFirstAN },
{ TPS_InHost, actionTPS_InHost },
{ TPS_InEmail, actionTPS_InEmail },
{ TPS_InFileFirst, actionTPS_InFileFirst },
{ TPS_InFile, actionTPS_InFile },
{ TPS_InFileNext, actionTPS_InFileNext },
{ TPS_InURIFirst, actionTPS_InURIFirst },
{ TPS_InURIStart, actionTPS_InURIStart },
{ TPS_InURI, actionTPS_InURI },
{ TPS_InFURL, actionTPS_InFURL },
{ TPS_InProtocolFirst, actionTPS_InProtocolFirst },
{ TPS_InProtocolSecond, actionTPS_InProtocolSecond },
{ TPS_InProtocolEnd, actionTPS_InProtocolEnd },
{ TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst },
{ TPS_InHyphenLatWord, actionTPS_InHyphenLatWord },
{ TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst },
{ TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord },
{ TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst },
{ TPS_InHyphenUWord, actionTPS_InHyphenUWord },
{ TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst },
{ TPS_InHyphenValue, actionTPS_InHyphenValue },
{ TPS_InHyphenValueExact, actionTPS_InHyphenValueExact },
{ TPS_InParseHyphen, actionTPS_InParseHyphen },
{ TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen },
{ TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart },
{ TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart },
{ TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart },
{ TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt },
{ TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst },
{ TPS_InHDecimalPart, actionTPS_InHDecimalPart },
{ TPS_InHVersionPartFirst, actionTPS_InHVersionPartFirst },
{ TPS_InHVersionPart, actionTPS_InHVersionPart },
{ TPS_Null, NULL }
};
/*
 * TParserGet - extract the next lexeme from the parser's input.
 *
 * Runs the state machine described by Actions[] starting at the current
 * position, one character per iteration, until an action flagged A_BINGO
 * fires or the input is exhausted.
 *
 * prs: parser with a valid position stack (prs->state must not be NULL).
 *
 * Returns true when a lexeme was found.  The result is then available in
 * prs->lexeme (pointer into prs->str, not NUL-terminated),
 * prs->lenbytelexeme, prs->lencharlexeme and prs->type.
 * Returns false when the position is already at the end of the string or
 * when the loop ends without an A_BINGO action.
 */
bool
TParserGet( TParser *prs ) {
	TParserStateActionItem *item=NULL;

	/* validate the position stack BEFORE it is dereferenced for the first time */
	Assert( prs->state );

	/* nothing left to parse */
	if ( prs->state->posbyte >= prs->lenstr )
		return false;

	prs->lexeme = prs->str + prs->state->posbyte;
	prs->state->pushedAtAction = NULL;

	/* look at string */
	while (prs->state->posbyte <= prs->lenstr) {
		/* a character length of 0 stands for the end of the string */
		if ( prs->state->posbyte == prs->lenstr )
			prs->state->charlen = 0;
		else
			prs->state->charlen = ( prs->charmaxlen == 1 ) ? prs->charmaxlen :
				pg_mblen( prs->str + prs->state->posbyte );

		Assert( prs->state->posbyte + prs->state->charlen <= prs->lenstr );
		Assert( prs->state->state >=TPS_Base && prs->state->state < TPS_Null );
		Assert( Actions[ prs->state->state ].state == prs->state->state );

		item = Actions[ prs->state->state ].action;
		Assert(item!=NULL);

		/*
		 * If this position pushed before and we came back by A_POP, do not
		 * rescan entries that precede the one that pushed.
		 */
		if ( item < prs->state->pushedAtAction )
			item = prs->state->pushedAtAction;

		/* find action by character class */
		while( item->isclass ) {
			prs->c = item->c;	/* argument for tests comparing with a given char */
			if ( item->isclass(prs)!=0 ) {
				if ( item > prs->state->pushedAtAction ) /* remember: after pushing we were by false way */
					break;
			}
			item++;
		}
		/* here item is either the matching entry or the default (isclass==NULL) one */

		prs->state->pushedAtAction = NULL;

		/* call special handler if exists */
		if ( item->special )
			item->special(prs);

		/* BINGO, lexeme is found */
		if ( item->flags & A_BINGO ) {
			Assert( item->type>0 );
			prs->lenbytelexeme = prs->state->lenbytelexeme;
			prs->lencharlexeme = prs->state->lencharlexeme;
			prs->state->lenbytelexeme = prs->state->lencharlexeme = 0;
			prs->type = item->type;
		}

		/* do various actions by flags */
		if ( item->flags & A_POP ) { /* pop stored state in stack */
			TParserPosition *ptr = prs->state->prev;
			pfree( prs->state );
			prs->state = ptr;
			Assert( prs->state );
		} else if ( item->flags & A_PUSH ) { /* push (store) state in stack */
			prs->state->pushedAtAction = item; /* remember where we push */
			prs->state = newTParserPosition( prs->state );
		} else if ( item->flags & A_CLEAR ) { /* clear previous pushed state */
			TParserPosition *ptr;
			Assert( prs->state->prev );
			ptr = prs->state->prev->prev;
			pfree( prs->state->prev );
			prs->state->prev = ptr;
		} else if ( item->flags & A_CLRALL ) { /* clear all previous pushed state */
			TParserPosition *ptr;
			while( prs->state->prev ) {
				ptr = prs->state->prev->prev;
				pfree( prs->state->prev );
				prs->state->prev = ptr;
			}
		} else if ( item->flags & A_MERGE ) { /* merge posinfo with current and pushed state */
			TParserPosition *ptr = prs->state;
			Assert( prs->state->prev );
			prs->state = prs->state->prev;
			prs->state->posbyte = ptr->posbyte;
			prs->state->poschar = ptr->poschar;
			prs->state->charlen = ptr->charlen;
			prs->state->lenbytelexeme = ptr->lenbytelexeme;
			prs->state->lencharlexeme = ptr->lencharlexeme;
			pfree(ptr);
		}

		/* set new state if pointed */
		if ( item->tostate != TPS_Null )
			prs->state->state = item->tostate;

		/* check for go away */
		if ( (item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN)==0 ) )
			break;

		/* go to beginning of loop if we should rerun or we just restore state */
		if ( item->flags & ( A_RERUN | A_POP ) )
			continue;

		/* move forward */
		if ( prs->state->charlen ) {
			prs->state->posbyte += prs->state->charlen;
			prs->state->lenbytelexeme += prs->state->charlen;
			prs->state->poschar ++;
			prs->state->lencharlexeme ++;
		}
	}

	return (item && (item->flags & A_BINGO)) ? true : false;
}
#ifndef __PARSER_H__ #ifndef __PARSER_H__
#define __PARSER_H__ #define __PARSER_H__
extern char *token; #include <ctype.h>
extern int tokenlen; #include <limits.h>
int tsearch2_yylex(void); #include "ts_locale.h"
void tsearch2_start_parse_str(char *, int);
/*
 * States of the text parser's state machine.
 *
 * The numeric value of a state is used as an index into the Actions[]
 * table, so the order of the enumerators must match the order of the
 * entries of Actions[].  TPS_Null is not a real state: as the "tostate" of
 * an action it means "keep the current state", and it marks the end of
 * the valid state range.
 *
 * NOTE(review): the text in front of "typedef enum {" on the next line
 * looks like a removed declaration fused in by a side-by-side diff view -
 * confirm against the real header.
 */
void tsearch2_end_parse(void); typedef enum {
TPS_Base = 0,
TPS_InUWord,
TPS_InLatWord,
TPS_InCyrWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InSpace,
TPS_InUDecimalFirst,
TPS_InUDecimal,
TPS_InDecimalFirst,
TPS_InDecimal,
TPS_InVersionFirst,
TPS_InVersion,
TPS_InMantissaFirst,
TPS_InMantissaSign,
TPS_InMantissa,
TPS_InHTMLEntityFirst,
TPS_InHTMLEntity,
TPS_InHTMLEntityNumFirst,
TPS_InHTMLEntityNum,
TPS_InHTMLEntityEnd,
TPS_InTagFirst,
TPS_InTagCloseFirst,
TPS_InTag,
TPS_InTagEscapeK,
TPS_InTagEscapeKK,
TPS_InTagBackSleshed,
TPS_InTagEnd,
TPS_InCommentFirst,
TPS_InCommentLast,
TPS_InComment,
TPS_InCloseCommentFirst,
TPS_InCloseCommentLast,
TPS_InCommentEnd,
TPS_InHostFirstDomen,
TPS_InHostDomenSecond,
TPS_InHostDomen,
TPS_InPortFirst,
TPS_InPort,
TPS_InHostFirstAN,
TPS_InHost,
TPS_InEmail,
TPS_InFileFirst,
TPS_InFile,
TPS_InFileNext,
TPS_InURIFirst,
TPS_InURIStart,
TPS_InURI,
TPS_InFURL,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst,
TPS_InHyphenLatWord,
TPS_InHyphenCyrWordFirst,
TPS_InHyphenCyrWord,
TPS_InHyphenUWordFirst,
TPS_InHyphenUWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart,
TPS_InHyphenLatWordPart,
TPS_InHyphenUWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
/* forward declaration */
struct TParser;
/* character-class test: returns non-zero when the current character matches */
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
/* hook called when the action owning it is selected, before its flags are applied */
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
/*
 * One entry of a state's action table.  A table is scanned from the top;
 * the entry with isclass == NULL ends the table and is the default action.
 */
typedef struct {
TParserCharTest isclass; /* test to apply to the current character; NULL = default entry */
char c; /* copied into TParser.c before isclass is called */
uint16 flags; /* combination of A_* action flags */
TParserState tostate; /* state to switch to; TPS_Null = keep the current state */
int type; /* lexeme type reported when flags contain A_BINGO */
TParserSpecial special; /* optional handler, may be NULL */
} TParserStateActionItem;
/* Binds a state to its action table; see the Actions[] array */
typedef struct {
TParserState state;
TParserStateActionItem *action;
} TParserStateAction;
/*
 * One saved parser position.  Positions form a stack linked through
 * "prev"; A_PUSH actions add a position and A_POP actions return to the
 * previous one.
 */
typedef struct TParserPosition {
int posbyte; /* position of parser in bytes */
int poschar; /* position of parser in characters */
int charlen; /* length of current char */
int lenbytelexeme; /* length in bytes of the lexeme collected so far */
int lencharlexeme; /* length in characters of the lexeme collected so far */
TParserState state; /* state of the machine at this position */
struct TParserPosition *prev; /* previously pushed position, or NULL */
int flags;
TParserStateActionItem *pushedAtAction; /* action entry that pushed from this position */
} TParserPosition;
/* Complete state of one parse: the input, the position stack and the last lexeme */
typedef struct TParser {
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wide string */
/* State of parse */
int charmaxlen; /* when 1, every character is taken to be one byte long */
bool usewide;
TParserPosition *state; /* current position (top of the position stack) */
bool ignore;
bool wanthost;
/* silly char */
char c; /* "c" of the action entry being tested, for tests that compare with a given char */
/* out */
char *lexeme; /* start of the found lexeme inside str */
int lenbytelexeme; /* its length in bytes */
int lencharlexeme; /* its length in characters */
int type; /* its lexeme type */
} TParser;
/* Creates a parser; the arguments are presumably the input string and its length - confirm against the definition */
TParser* TParserInit( char *, int );
/* Fetches the next lexeme into the "out" fields of the parser; returns false when no lexeme was found */
bool TParserGet( TParser* );
/* Counterpart of TParserInit(); presumably releases the parser - confirm against the definition */
void TParserClose( TParser* );
#endif #endif
%{
#include "postgres.h"
#include "deflex.h"
#include "parser.h"
#include "common.h"
/* Avoid exit() on fatal scanner errors */
#undef fprintf
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
/* Result of the last scanner call: start and length of the matched token */
char *token = NULL; /* pointer to token */
int tokenlen;
static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
/* Growable buffer in which the text of a tag is accumulated across several matches */
typedef struct {
int tlen; /* allocated size of str in bytes */
int clen; /* number of bytes currently used, not counting the trailing '\0' */
char *str; /* NUL-terminated collected text, NULL until first use */
} TagStorage;
static TagStorage ts={0,0,NULL};
/*
 * Appends the scanner's current match (tsearch2_yytext, tsearch2_yyleng
 * bytes) to the tag buffer "ts" and keeps the buffer NUL-terminated.
 *
 * The buffer is grown by doubling until the match plus the terminator fit.
 * The new size and pointer are stored only after realloc() succeeded, so
 * on an out-of-memory error the old buffer is neither leaked nor paired
 * with a wrong capacity.
 */
static void
addTag(void)
{
	if ( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
		/* start from 1 for an empty buffer: doubling 0 would never terminate */
		int newlen = ( ts.tlen > 0 ) ? ts.tlen : 1;
		char *newstr;

		while( ts.clen+tsearch2_yyleng+1 > newlen )
			newlen*=2;

		newstr=realloc(ts.str,newlen);
		if (!newstr)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		ts.str=newstr;
		ts.tlen=newlen;
	}
	memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
	ts.clen+=tsearch2_yyleng;
	ts.str[ts.clen]='\0';
}
/*
 * Begin collecting a new tag: make sure the accumulator has a buffer,
 * discard whatever it held before, then store the text just matched by the
 * scanner as the first piece.
 *
 * The buffer is allocated only on first use, sized for the current match
 * plus a trailing NUL; afterwards it is reused and addTag() enlarges it on
 * demand.
 */
static void
startTag(void)
{
	if (!ts.str)
	{
		ts.tlen = tsearch2_yyleng + 1;
		ts.str = malloc(ts.tlen);
		if (ts.str == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	/* empty the accumulator before the first piece is appended */
	ts.str[0] = '\0';
	ts.clen = 0;

	addTag();
}
%}
/* nodefault: flex generates no fallback rule, so the rules below have to cover every input byte in every state */
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap
/* All start conditions below are exclusive (%x): while one is active, only rules tagged with it can match */
/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT
/* cyrillic koi8 char */
/* \200-\377 is every byte with the high bit set (128..255), so these classes treat any non-ASCII byte as a letter */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
/* one or more dot-terminated labels followed by an alphabetic final label */
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
/* characters accepted in the part of a URL that follows the host */
URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { /* "<script" in any letter case: start collecting the whole script element */ BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
/* closing "</script>": everything collected since "<script" is returned as one TAG token */
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<!--" { /* start of a comment: collect it up to the closing "-->" */ BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
/* end of comment: the whole comment is returned as one TAG token */
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<"[\![:alpha:]] { /* "<" followed by "!" or a letter opens a tag */ BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { /* "</" followed by a letter opens a closing tag */ BEGIN INTAG; startTag(); }
<INTAG>"\"" { /* opening double quote inside a tag: a ">" seen while quoted does not end the tag */ BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" { /* backslash-escaped quote stays inside the quoted value */ addTag(); }
<QINTAG>"\"" { /* closing double quote: back to plain in-tag scanning */ BEGIN INTAG; addTag(); }
<INTAG>">" {
/* unquoted ">" ends the tag; the collected text is returned as one TAG token */
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { /* any other byte, newline included, is appended to the text being collected */ addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
/* named entity; only quot, amp, nbsp, lt and gt are recognized */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
\&\#[0-9][0-9]?[0-9]?\; {
/* numeric entity with one to three decimal digits */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return EMAIL;
}
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
/* exponent marker may be e, E, d or D */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SCIENTIFIC;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
/* at least three dot-separated digit groups, ending in a digit */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
[+-]?[0-9]+\.[0-9]+ {
/* optionally signed number with a fractional part */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
[+-][0-9]+ {
/* the sign is mandatory here; unsigned digit runs are handled by the next rule */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SIGNEDINT;
}
<DELIM,INITIAL>[0-9]+ {
/* also active in DELIM, i.e. for numeric parts of a hyphenated word */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UNSIGNEDINT;
}
http"://" {
/* protocol prefix: what follows is scanned under the URL rules */
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
ftp"://" {
/* the ftp prefix is reported with the same HTTP token type as http */
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
	/*
	 * Full URL (host followed by "/" or ":" and a URI part).  The whole
	 * match is returned as one FURL token, then pushed back with yyless(0)
	 * so it is scanned again in the SERVER state, where the HOST and URI
	 * rules report its parts.  This rule is not active in SERVER, so the
	 * pushed-back text cannot match it a second time.
	 *
	 * The token must be a private copy: the scanner's own text buffer is
	 * reused by the following matches.  tokenlen is taken before yyless(),
	 * which resets the match length.
	 */
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	BEGIN SERVER;
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return FURL;
}
<SERVER,URL,INITIAL>{HOSTNAME} {
/* host name: on its own, after a protocol prefix, or as the first part of a pushed-back full URL */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HOST;
}
<SERVER>[/:]{URI} {
/* only active in SERVER, i.e. while a pushed-back full URL is being split into parts */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return URI;
}
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
/* path-like text containing at least one "/" with characters on both sides */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return FILEPATH;
}
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
	/*
	 * Hyphenated word made only of high-bit letters.  The whole word is
	 * returned first, then pushed back with yyless(0) and scanned again in
	 * the DELIM state, which reports its parts one by one.  The token has
	 * to be a private copy because the scanner's text buffer is reused by
	 * the following matches; tokenlen is taken before yyless() resets the
	 * match length.
	 */
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	BEGIN DELIM;
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return CYRHYPHENWORD;
}
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
	/* Same scheme as above, for a word whose parts all match [[:alpha:]]. */
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	BEGIN DELIM;
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return LATHYPHENWORD;
}
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
	/* Same scheme as above, for any mix of letters, digits and high-bit bytes. */
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	BEGIN DELIM;
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return HYPHENWORD;
}
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
/* version number appearing as a part of a hyphenated word */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
<DELIM>\+?[0-9]+\.[0-9]+ {
/* inside a hyphenated word only "+" is accepted as a sign; "-" is the part separator */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRPARTHYPHENWORD;
}
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return PARTHYPHENWORD;
}
<DELIM>- {
/* the hyphen between two parts is reported as SPACE */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
/* nothing else fits: switch to INITIAL and push the character back so the INITIAL rules scan it; no token is returned here */
BEGIN INITIAL;
yyless( 0 );
}
{CYRALPHA}+ /* normal word */ {
/* run of high-bit bytes only */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRWORD;
}
[[:alpha:]]+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATWORD;
}
{ALNUM}+ /* normal word */ {
/* any mix of digits, ASCII letters and high-bit bytes not taken by a longer or earlier rule */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UWORD;
}
[ \r\n\t]+ {
/* a run of whitespace is returned as a single SPACE token */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
. {
/* catch-all: any other single character is also reported as SPACE */
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
%%
/*
 * Clean up after parsing from a string: release the saved copy of the last
 * whole-match token and the scanner's input buffer, and reset both pointers
 * so that a later call finds nothing left to release.
 */
void
tsearch2_end_parse(void)
{
	/* free(NULL) is a no-op, so no guard is needed */
	free(s);
	s = NULL;

	tsearch2_yy_delete_buffer(buf);
	buf = NULL;
}
/* start parse from string */
void
tsearch2_start_parse_str(char* str, int limit)
{
if (buf)
tsearch2_end_parse();
buf = tsearch2_yy_scan_bytes( str, limit );
tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}
...@@ -39,8 +39,7 @@ Datum prsd_start(PG_FUNCTION_ARGS); ...@@ -39,8 +39,7 @@ Datum prsd_start(PG_FUNCTION_ARGS);
Datum Datum
prsd_start(PG_FUNCTION_ARGS) prsd_start(PG_FUNCTION_ARGS)
{ {
tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)); PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
PG_RETURN_POINTER(NULL);
} }
PG_FUNCTION_INFO_V1(prsd_getlexeme); PG_FUNCTION_INFO_V1(prsd_getlexeme);
...@@ -48,14 +47,17 @@ Datum prsd_getlexeme(PG_FUNCTION_ARGS); ...@@ -48,14 +47,17 @@ Datum prsd_getlexeme(PG_FUNCTION_ARGS);
Datum Datum
prsd_getlexeme(PG_FUNCTION_ARGS) prsd_getlexeme(PG_FUNCTION_ARGS)
{ {
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */ TParser *p=(TParser*)PG_GETARG_POINTER(0);
char **t = (char **) PG_GETARG_POINTER(1); char **t = (char **) PG_GETARG_POINTER(1);
int *tlen = (int *) PG_GETARG_POINTER(2); int *tlen = (int *) PG_GETARG_POINTER(2);
int type = tsearch2_yylex();
*t = token; if ( !TParserGet(p) )
*tlen = tokenlen; PG_RETURN_INT32(0);
PG_RETURN_INT32(type);
*t = p->lexeme;
*tlen = p->lenbytelexeme;
PG_RETURN_INT32(p->type);
} }
PG_FUNCTION_INFO_V1(prsd_end); PG_FUNCTION_INFO_V1(prsd_end);
...@@ -63,8 +65,8 @@ Datum prsd_end(PG_FUNCTION_ARGS); ...@@ -63,8 +65,8 @@ Datum prsd_end(PG_FUNCTION_ARGS);
Datum Datum
prsd_end(PG_FUNCTION_ARGS) prsd_end(PG_FUNCTION_ARGS)
{ {
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */ TParser *p=(TParser*)PG_GETARG_POINTER(0);
tsearch2_end_parse(); TParserClose(p);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment