Commit c52795d1 authored by Teodor Sigaev's avatar Teodor Sigaev

Text parser rewritten:

        - supports multibyte encodings
        - more strict rules for lexemes
        - flex isn't used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operation for tsquery for combining
          tsquery from it's parts:  'foo & bar' || 'asd' => 'foo & bar | asd'
parent b91e6ed9
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.11 2005/11/08 17:08:46 teodor Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $
MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
......@@ -6,7 +6,8 @@ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
wparser.o wparser_def.o \
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
tsvector_op.o rank.o ts_stat.o \
query_util.o query_support.o query_rewrite.o query_gist.o
query_util.o query_support.o query_rewrite.o query_gist.o \
ts_locale.o
SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
......
This diff is collapsed.
......@@ -51,10 +51,20 @@ Datum to_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(to_tsquery_current);
Datum to_tsquery_current(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery);
Datum plainto_tsquery(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_name);
Datum plainto_tsquery_name(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(plainto_tsquery_current);
Datum plainto_tsquery_current(PG_FUNCTION_ARGS);
/* parser's states */
#define WAITOPERAND 1
#define WAITOPERATOR 2
#define WAITFIRSTOPERAND 3
#define WAITSINGLEOPERAND 4
/*
* node of query tree, also used
......@@ -195,6 +205,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
else if (*(state->buf) != ' ')
return ERR;
break;
case WAITSINGLEOPERAND:
if ( *(state->buf) == '\0' )
return END;
*strval = state->buf;
*lenval = strlen( state->buf );
state->buf += strlen( state->buf );
state->count++;
return VAL;
default:
return ERR;
break;
......@@ -582,7 +600,7 @@ findoprnd(ITEM * ptr, int4 *pos)
* input
*/
static QUERYTYPE *
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id)
queryin(char *buf, void (*pushval) (QPRS_STATE *, int, char *, int, int2), int cfg_id, bool isplain)
{
QPRS_STATE state;
int4 i;
......@@ -599,7 +617,7 @@ static QUERYTYPE *
/* init state */
state.buf = buf;
state.state = WAITFIRSTOPERAND;
state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND;
state.count = 0;
state.num = 0;
state.str = NULL;
......@@ -679,7 +697,7 @@ Datum
tsquery_in(PG_FUNCTION_ARGS)
{
SET_FUNCOID();
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0));
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
}
/*
......@@ -910,7 +928,7 @@ to_tsquery(PG_FUNCTION_ARGS)
str = text2char(in);
PG_FREE_IF_COPY(in, 1);
query = queryin(str, pushval_morph, PG_GETARG_INT32(0));
query = queryin(str, pushval_morph, PG_GETARG_INT32(0),false);
if ( query->size == 0 )
PG_RETURN_POINTER(query);
......@@ -950,3 +968,59 @@ to_tsquery_current(PG_FUNCTION_ARGS)
Int32GetDatum(get_currcfg()),
PG_GETARG_DATUM(0)));
}
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_P(1);
char *str;
QUERYTYPE *query;
ITEM *res;
int4 len;
SET_FUNCOID();
str = text2char(in);
PG_FREE_IF_COPY(in, 1);
query = queryin(str, pushval_morph, PG_GETARG_INT32(0), true);
if ( query->size == 0 )
PG_RETURN_POINTER(query);
res = clean_fakeval_v2(GETQUERY(query), &len);
if (!res)
{
query->len = HDRSIZEQT;
query->size = 0;
PG_RETURN_POINTER(query);
}
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(ITEM));
pfree(res);
PG_RETURN_POINTER(query);
}
Datum
plainto_tsquery_name(PG_FUNCTION_ARGS)
{
text *name = PG_GETARG_TEXT_P(0);
Datum res;
SET_FUNCOID();
res = DirectFunctionCall2(plainto_tsquery,
Int32GetDatum(name2id_cfg(name)),
PG_GETARG_DATUM(1));
PG_FREE_IF_COPY(name, 0);
PG_RETURN_DATUM(res);
}
Datum
plainto_tsquery_current(PG_FUNCTION_ARGS)
{
SET_FUNCOID();
PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery,
Int32GetDatum(get_currcfg()),
PG_GETARG_DATUM(0)));
}
......@@ -14,6 +14,117 @@ tsquery_numnode(PG_FUNCTION_ARGS) {
PG_RETURN_INT32(nnode);
}
static QTNode*
join_tsqueries(QUERYTYPE *a, QUERYTYPE *b) {
QTNode *res=(QTNode*)palloc0( sizeof(QTNode) );
res->flags |= QTN_NEEDFREE;
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
res->valnode->type = OPR;
res->child = (QTNode**)palloc0( sizeof(QTNode*)*2 );
res->child[0] = QT2QTN( GETQUERY(b), GETOPERAND(b) );
res->child[1] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
res->nchild = 2;
return res;
}
PG_FUNCTION_INFO_V1(tsquery_and);
Datum tsquery_and(PG_FUNCTION_ARGS);
Datum
tsquery_and(PG_FUNCTION_ARGS) {
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
QTNode *res;
QUERYTYPE *query;
if ( a->size == 0 ) {
PG_FREE_IF_COPY(a,1);
PG_RETURN_POINTER(b);
} else if ( b->size == 0 ) {
PG_FREE_IF_COPY(b,1);
PG_RETURN_POINTER(a);
}
res = join_tsqueries(a, b);
res->valnode->val = '&';
query = QTN2QT( res, PlainMemory );
QTNFree(res);
PG_FREE_IF_COPY(a,0);
PG_FREE_IF_COPY(b,1);
PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_or);
Datum tsquery_or(PG_FUNCTION_ARGS);
Datum
tsquery_or(PG_FUNCTION_ARGS) {
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
QUERYTYPE *b = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)));
QTNode *res;
QUERYTYPE *query;
if ( a->size == 0 ) {
PG_FREE_IF_COPY(a,1);
PG_RETURN_POINTER(b);
} else if ( b->size == 0 ) {
PG_FREE_IF_COPY(b,1);
PG_RETURN_POINTER(a);
}
res = join_tsqueries(a, b);
res->valnode->val = '|';
query = QTN2QT( res, PlainMemory );
QTNFree(res);
PG_FREE_IF_COPY(a,0);
PG_FREE_IF_COPY(b,1);
PG_RETURN_POINTER(query);
}
PG_FUNCTION_INFO_V1(tsquery_not);
Datum tsquery_not(PG_FUNCTION_ARGS);
Datum
tsquery_not(PG_FUNCTION_ARGS) {
QUERYTYPE *a = (QUERYTYPE *) DatumGetPointer(PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(0)));
QTNode *res;
QUERYTYPE *query;
if ( a->size == 0 )
PG_RETURN_POINTER(a);
res=(QTNode*)palloc0( sizeof(QTNode) );
res->flags |= QTN_NEEDFREE;
res->valnode = (ITEM*)palloc0( sizeof(ITEM) );
res->valnode->type = OPR;
res->valnode->val = '!';
res->child = (QTNode**)palloc0( sizeof(QTNode*) );
res->child[0] = QT2QTN( GETQUERY(a), GETOPERAND(a) );
res->nchild = 1;
query = QTN2QT( res, PlainMemory );
QTNFree(res);
PG_FREE_IF_COPY(a,0);
PG_RETURN_POINTER(query);
}
static int
CompareTSQ( QUERYTYPE *a, QUERYTYPE *b ) {
if ( a->size != b->size ) {
......
......@@ -173,6 +173,13 @@ select to_tsquery('default', 'asd&(and|fghj)');
select to_tsquery('default', '(asd&and)|fghj');
select to_tsquery('default', '(asd&!and)|fghj');
select to_tsquery('default', '(the|and&(i&1))&fghj');
select plainto_tsquery('default', 'the and z 1))& fghj');
select plainto_tsquery('default', 'foo bar') && plainto_tsquery('default', 'asd');
select plainto_tsquery('default', 'foo bar') || plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') || !!plainto_tsquery('default', 'asd fg');
select plainto_tsquery('default', 'foo bar') && 'asd | fg';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:B';
select 'a b:89 ca:23A,64b d:34c'::tsvector @@ 'd:AC & ca:A';
......
#include "ts_locale.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
#if defined(TS_USE_WIDE) && defined(WIN32)
size_t
wchar2char( const char *to, const wchar_t *from, size_t len ) {
if (GetDatabaseEncoding() == PG_UTF8) {
int r;
if (len==0)
return 0;
r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes,
NULL, NULL);
if ( r==0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("UTF-16 to UTF-8 translation failed: %lu",
GetLastError())));
return r;
}
return wcstombs(to, from, len);
}
size_t
char2wchar( const wchar_t *to, const char *from, size_t len ) {
if (GetDatabaseEncoding() == PG_UTF8) {
int r;
if (len==0)
return 0;
r = MultiByteToWideChar(CP_UTF8, 0, from, len,
to, len);
if (!r) {
pg_verifymbstr(from, len, false);
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"),
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
}
Assert(r <= nbytes);
return r;
}
return mbstowcs(to, from, len);
}
#endif
#ifndef __TSLOCALE_H__
#define __TSLOCALE_H__
#include "postgres.h"
#include <ctype.h>
#include <limits.h>
/*
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
* declare them in <wchar.h>.
*/
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
#define TS_USE_WIDE
#ifdef WIN32
size_t wchar2char( const char *to, const wchar_t *from, size_t len );
size_t char2wchar( const wchar_t *to, const char *from, size_t len );
#else /* WIN32 */
/* correct mbstowcs */
#define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#endif /* defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) */
#endif /* __TSLOCALE_H__ */
......@@ -427,6 +427,21 @@ RETURNS tsquery
AS 'MODULE_PATHNAME','to_tsquery_current'
LANGUAGE 'c' with (isstrict,iscachable);
CREATE FUNCTION plainto_tsquery(oid, text)
RETURNS tsquery
AS 'MODULE_PATHNAME'
LANGUAGE 'c' with (isstrict,iscachable);
CREATE FUNCTION plainto_tsquery(text, text)
RETURNS tsquery
AS 'MODULE_PATHNAME','plainto_tsquery_name'
LANGUAGE 'c' with (isstrict,iscachable);
CREATE FUNCTION plainto_tsquery(text)
RETURNS tsquery
AS 'MODULE_PATHNAME','plainto_tsquery_current'
LANGUAGE 'c' with (isstrict,iscachable);
--operations
CREATE FUNCTION exectsq(tsvector, tsquery)
RETURNS bool
......@@ -929,6 +944,47 @@ CREATE OR REPLACE FUNCTION numnode(tsquery)
language 'C'
with (isstrict,iscachable);
CREATE OR REPLACE FUNCTION tsquery_and(tsquery,tsquery)
returns tsquery
as 'MODULE_PATHNAME', 'tsquery_and'
language 'C'
with (isstrict,iscachable);
CREATE OPERATOR && (
LEFTARG = tsquery,
RIGHTARG = tsquery,
PROCEDURE = tsquery_and,
COMMUTATOR = '&&',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OR REPLACE FUNCTION tsquery_or(tsquery,tsquery)
returns tsquery
as 'MODULE_PATHNAME', 'tsquery_or'
language 'C'
with (isstrict,iscachable);
CREATE OPERATOR || (
LEFTARG = tsquery,
RIGHTARG = tsquery,
PROCEDURE = tsquery_or,
COMMUTATOR = '||',
RESTRICT = contsel,
JOIN = contjoinsel
);
CREATE OR REPLACE FUNCTION tsquery_not(tsquery)
returns tsquery
as 'MODULE_PATHNAME', 'tsquery_not'
language 'C'
with (isstrict,iscachable);
CREATE OPERATOR !! (
RIGHTARG = tsquery,
PROCEDURE = tsquery_not
);
--------------rewrite subsystem
CREATE OR REPLACE FUNCTION rewrite(tsquery, text)
......
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
SUBOBJS = parser.o deflex.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
PG_CPPFLAGS = -I$(srcdir)/..
......@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
all: SUBSYS.o
parser.c: parser.l
ifdef FLEX
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
else
@$(missing) flex $< $@
endif
SUBSYS.o: $(SUBOBJS)
$(LD) $(LDREL) $(LDOUT) $@ $^
......
......@@ -15,7 +15,7 @@ const char *lex_descr[] = {
"Latin part of hyphenated word",
"Space symbols",
"HTML Tag",
"HTTP head",
"Protocol head",
"Hyphenated word",
"Latin hyphenated word",
"Non-latin hyphenated word",
......@@ -42,7 +42,7 @@ const char *tok_alias[] = {
"lpart_hword",
"blank",
"tag",
"http",
"protocol",
"hword",
"lhword",
"nlhword",
......
......@@ -17,7 +17,7 @@
#define LATPARTHYPHENWORD 11
#define SPACE 12
#define TAG 13
#define HTTP 14
#define PROTOCOL 14
#define HYPHENWORD 15
#define LATHYPHENWORD 16
#define CYRHYPHENWORD 17
......
This diff is collapsed.
#ifndef __PARSER_H__
#define __PARSER_H__
extern char *token;
extern int tokenlen;
int tsearch2_yylex(void);
void tsearch2_start_parse_str(char *, int);
void tsearch2_end_parse(void);
#include <ctype.h>
#include <limits.h>
#include "ts_locale.h"
typedef enum {
TPS_Base = 0,
TPS_InUWord,
TPS_InLatWord,
TPS_InCyrWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InSpace,
TPS_InUDecimalFirst,
TPS_InUDecimal,
TPS_InDecimalFirst,
TPS_InDecimal,
TPS_InVersionFirst,
TPS_InVersion,
TPS_InMantissaFirst,
TPS_InMantissaSign,
TPS_InMantissa,
TPS_InHTMLEntityFirst,
TPS_InHTMLEntity,
TPS_InHTMLEntityNumFirst,
TPS_InHTMLEntityNum,
TPS_InHTMLEntityEnd,
TPS_InTagFirst,
TPS_InTagCloseFirst,
TPS_InTag,
TPS_InTagEscapeK,
TPS_InTagEscapeKK,
TPS_InTagBackSleshed,
TPS_InTagEnd,
TPS_InCommentFirst,
TPS_InCommentLast,
TPS_InComment,
TPS_InCloseCommentFirst,
TPS_InCloseCommentLast,
TPS_InCommentEnd,
TPS_InHostFirstDomen,
TPS_InHostDomenSecond,
TPS_InHostDomen,
TPS_InPortFirst,
TPS_InPort,
TPS_InHostFirstAN,
TPS_InHost,
TPS_InEmail,
TPS_InFileFirst,
TPS_InFile,
TPS_InFileNext,
TPS_InURIFirst,
TPS_InURIStart,
TPS_InURI,
TPS_InFURL,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst,
TPS_InHyphenLatWord,
TPS_InHyphenCyrWordFirst,
TPS_InHyphenCyrWord,
TPS_InHyphenUWordFirst,
TPS_InHyphenUWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart,
TPS_InHyphenLatWordPart,
TPS_InHyphenUWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
/* forward declaration */
struct TParser;
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
typedef struct {
TParserCharTest isclass;
char c;
uint16 flags;
TParserState tostate;
int type;
TParserSpecial special;
} TParserStateActionItem;
typedef struct {
TParserState state;
TParserStateActionItem *action;
} TParserStateAction;
typedef struct TParserPosition {
int posbyte; /* position of parser in bytes */
int poschar; /* osition of parser in characters */
int charlen; /* length of current char */
int lenbytelexeme;
int lencharlexeme;
TParserState state;
struct TParserPosition *prev;
int flags;
TParserStateActionItem *pushedAtAction;
} TParserPosition;
typedef struct TParser {
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wsting */
/* State of parse */
int charmaxlen;
bool usewide;
TParserPosition *state;
bool ignore;
bool wanthost;
/* silly char */
char c;
/* out */
char *lexeme;
int lenbytelexeme;
int lencharlexeme;
int type;
} TParser;
TParser* TParserInit( char *, int );
bool TParserGet( TParser* );
void TParserClose( TParser* );
#endif
%{
#include "postgres.h"
#include "deflex.h"
#include "parser.h"
#include "common.h"
/* Avoid exit() on fatal scanner errors */
#undef fprintf
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
char *token = NULL; /* pointer to token */
int tokenlen;
static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
typedef struct {
int tlen;
int clen;
char *str;
} TagStorage;
static TagStorage ts={0,0,NULL};
static void
addTag(void)
{
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
ts.tlen*=2;
ts.str=realloc(ts.str,ts.tlen);
if (!ts.str)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
ts.clen+=tsearch2_yyleng;
ts.str[ts.clen]='\0';
}
static void
startTag(void)
{
if ( ts.str==NULL ) {
ts.tlen=tsearch2_yyleng+1;
ts.str=malloc(ts.tlen);
if (!ts.str)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
ts.clen=0;
ts.str[0]='\0';
addTag();
}
%}
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap
/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT
/* cyrillic koi8 char */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" { addTag(); }
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return EMAIL;
}
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SCIENTIFIC;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
[+-]?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
[+-][0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SIGNEDINT;
}
<DELIM,INITIAL>[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UNSIGNEDINT;
}
http"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
ftp"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
BEGIN SERVER;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return FURL;
}
<SERVER,URL,INITIAL>{HOSTNAME} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HOST;
}
<SERVER>[/:]{URI} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return URI;
}
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return FILEPATH;
}
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return CYRHYPHENWORD;
}
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return LATHYPHENWORD;
}
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch2_yytext );
tokenlen = tsearch2_yyleng;
yyless( 0 );
token = s;
return HYPHENWORD;
}
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
<DELIM>\+?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRPARTHYPHENWORD;
}
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return PARTHYPHENWORD;
}
<DELIM>- {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
yyless( 0 );
}
{CYRALPHA}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRWORD;
}
[[:alpha:]]+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATWORD;
}
{ALNUM}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UWORD;
}
[ \r\n\t]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
. {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
%%
/* clearing after parsing from string */
void
tsearch2_end_parse(void)
{
if (s)
{
free(s);
s = NULL;
}
tsearch2_yy_delete_buffer( buf );
buf = NULL;
}
/* start parse from string */
void
tsearch2_start_parse_str(char* str, int limit)
{
if (buf)
tsearch2_end_parse();
buf = tsearch2_yy_scan_bytes( str, limit );
tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}
......@@ -39,8 +39,7 @@ Datum prsd_start(PG_FUNCTION_ARGS);
Datum
prsd_start(PG_FUNCTION_ARGS)
{
tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
PG_RETURN_POINTER(NULL);
PG_RETURN_POINTER(TParserInit( (char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
}
PG_FUNCTION_INFO_V1(prsd_getlexeme);
......@@ -48,14 +47,17 @@ Datum prsd_getlexeme(PG_FUNCTION_ARGS);
Datum
prsd_getlexeme(PG_FUNCTION_ARGS)
{
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
TParser *p=(TParser*)PG_GETARG_POINTER(0);
char **t = (char **) PG_GETARG_POINTER(1);
int *tlen = (int *) PG_GETARG_POINTER(2);
int type = tsearch2_yylex();
*t = token;
*tlen = tokenlen;
PG_RETURN_INT32(type);
if ( !TParserGet(p) )
PG_RETURN_INT32(0);
*t = p->lexeme;
*tlen = p->lenbytelexeme;
PG_RETURN_INT32(p->type);
}
PG_FUNCTION_INFO_V1(prsd_end);
......@@ -63,8 +65,8 @@ Datum prsd_end(PG_FUNCTION_ARGS);
Datum
prsd_end(PG_FUNCTION_ARGS)
{
/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
tsearch2_end_parse();
TParser *p=(TParser*)PG_GETARG_POINTER(0);
TParserClose(p);
PG_RETURN_VOID();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment