Commit b5770567 authored by Tom Lane's avatar Tom Lane

txtidx datatype for full text indexing with GiST.

From Oleg Bartunov and Teodor Sigaev.
parent c24216be
# $Header: /cvsroot/pgsql/contrib/Makefile,v 1.29 2001/10/01 01:52:38 ishii Exp $
# $Header: /cvsroot/pgsql/contrib/Makefile,v 1.30 2001/10/12 23:19:09 tgl Exp $
subdir = contrib
top_builddir = ..
......@@ -34,9 +34,14 @@ WANTED_DIRS = \
spi \
string \
tips \
tsearch \
userlock \
vacuumlo
ifeq ($(with_java),yes)
WANTED_DIRS += retep
endif
# Missing:
# ipc_check \ (does not have a makefile)
# mSQL-interface \ (requires msql installed)
......@@ -47,10 +52,6 @@ WANTED_DIRS = \
# tools \ (does not have a makefile)
# xml \ (non-standard makefile)
ifeq ($(with_java),yes)
WANTED_DIRS += retep
endif
all install installdirs uninstall clean distclean maintainer-clean check installcheck:
@for dir in $(WANTED_DIRS); do \
......
......@@ -175,6 +175,11 @@ tools -
Assorted developer tools
by Massimo Dal Zotto <dz@cs.unitn.it>
tsearch -
Full-text-index support using GiST
by Teodor Sigaev <teodor@stack.net> and Oleg Bartunov
<oleg@sai.msu.su>.
userlock -
User locks
by Massimo Dal Zotto <dz@cs.unitn.it>
......
# $Header: /cvsroot/pgsql/contrib/tsearch/Attic/Makefile,v 1.1 2001/10/12 23:19:09 tgl Exp $
# Build description for the tsearch contrib module.
subdir = contrib/tsearch
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# One shared module named "tsearch" is built from the objects listed in OBJS.
MODULE_big = tsearch
OBJS = parser.o crc32.o morph.o txtidx.o query.o gistidx.o rewrite.o
DATA_built = tsearch.sql
DOCS = README.tsearch
REGRESS = tsearch
# parser.c is generated from parser.l with flex; -Ptsearch_yy replaces the
# default "yy" symbol prefix with "tsearch_yy" (parser.l relies on those names).
# When FLEX is not defined the $(missing) helper is invoked instead.
parser.c: parser.l
ifdef FLEX
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch_yy -o'$@' $<
else
@$(missing) flex $< $@
endif
# parser.c is a generated file, so it is removed on clean.
EXTRA_CLEAN = parser.c
include $(top_srcdir)/contrib/contrib-global.mk
# DO NOT DELETE
This diff is collapsed.
/* Both POSIX and CRC32 checksums */
#include <sys/types.h>
#include <stdio.h>
#include <sys/types.h>
#include "crc32.h"
/*
* This code implements the AUTODIN II polynomial.
* The variable corresponding to the macro argument "crc" should
* be an unsigned long.
* Original code by Spencer Garrett <srg@quick.com>
*/
#define _CRC32_(crc, ch) (crc = (crc >> 8) ^ crc32tab[(crc ^ (ch)) & 0xff])
/* generated using the AUTODIN II polynomial
* x^32 + x^26 + x^23 + x^22 + x^16 +
* x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + 1
*
* 256-entry lookup table: the _CRC32_ macro indexes it with the low
* byte of (crc ^ input byte) and XORs the entry into (crc >> 8).
*/
static const unsigned int crc32tab[256] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
};
/*
 * crc32_sz - CRC32 checksum of a block of bytes.
 *
 * buf  - start of the data to checksum
 * size - number of bytes to process
 *
 * The accumulator starts at ~0, every byte is folded in with the
 * _CRC32_ table step, and the bitwise complement of the accumulator
 * is returned.  A NULL buffer or a non-positive size is treated as an
 * empty block (result 0) instead of reading out of bounds.
 */
unsigned int crc32_sz(char * buf, int size){
	unsigned int crc = ~0U;
	/* read the data as unsigned bytes so no sign extension is involved */
	const unsigned char *p = (const unsigned char *) buf;
	int remaining;

	if (buf == NULL || size <= 0)
		return ~crc;

	for (remaining = size; remaining > 0; remaining--, p++) {
		_CRC32_(crc, *p);
	}
	return ~crc;
}
#ifndef _CRC32_H
#define _CRC32_H

/* strlen() is used by the crc32() macro below */
#include <string.h>

/* Returns crc32 of data block: buf points to the data, size is its length in bytes */
extern unsigned int crc32_sz(char * buf, int size);

/*
 * Returns crc32 of null-terminated string.
 * Note: buf is evaluated twice, so it must not have side effects.
 */
#define crc32(buf) crc32_sz((buf),strlen(buf))

#endif
This diff is collapsed.
#ifndef __DEFLEX_H__
#define __DEFLEX_H__
/*
 * Lexeme type codes.  The scanner rules in parser.l return these
 * values, and morph.c labels the rows of its mapdict table with the
 * same names.
 */
/*
 * Remember: LASTNUM currently equals the highest type code below
 * (FILEPATH); presumably it must be updated when a type is added.
 */
#define LASTNUM 19
#define LATWORD 1
#define NONLATINWORD 2
#define UWORD 3
#define EMAIL 4
#define FURL 5
#define HOST 6
#define FLOAT 7
#define FINT 8
#define PARTWORD 9
#define NONLATINPARTWORD 10
#define LATPARTWORD 11
#define SPACE 12
#define SYMTAG 13
#define HTTP 14
#define DEFISWORD 15
#define DEFISLATWORD 16
#define DEFISNONLATINWORD 17
#define URI 18
#define FILEPATH 19
/* defined elsewhere; presumably textual descriptions indexed by type code - TODO confirm */
extern const char *descr[];
#endif
/*
 * List of compiled-in dictionaries.  morph.c includes this file twice:
 * once with DICT_BODY defined and once with DICT_TABLE defined inside
 * the initializer of dicts[].
 *
 * TABLE_DICT_START / TABLE_DICT_END wrap one table entry; the leading
 * comma in TABLE_DICT_START separates the entry from the preceding
 * element of dicts[].
 */
#define TABLE_DICT_START ,{
#define TABLE_DICT_END }
#include "dict/porter_english.dct"
/* the second dictionary is only compiled in when USE_LOCALE is defined */
#ifdef USE_LOCALE
#include "dict/russian_stemming.dct"
#endif
#undef TABLE_DICT_START
#undef TABLE_DICT_END
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef __GISTIDX_H__
#define __GISTIDX_H__

/*
#define GISTIDX_DEBUG
*/

/*
 * signature defines
 */
#define BITBYTE 8
#define SIGLENINT  64	/* >121 => key will toast, so it will not work !!! */
#define SIGLEN	( sizeof(int4)*SIGLENINT )	/* signature length in bytes */
#define SIGLENBIT (SIGLEN*BITBYTE)		/* signature length in bits */

typedef char BITVEC[SIGLEN];
typedef char *BITVECP;

/*
 * Loop helpers: execute statement "a" for every byte / every bit of a
 * signature.  They use a variable named "i" that must already be
 * declared in the calling scope.
 */
#define LOOPBYTE(a) \
	for(i=0;i<SIGLEN;i++) {\
		a;\
	}
#define LOOPBIT(a) \
	for(i=0;i<SIGLENBIT;i++) {\
		a;\
	}

/*
 * Bit access.  All macro arguments are parenthesized so that
 * expressions such as "p + 1" or "a | b" can be passed safely.
 */
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
#define CLRBIT(x,i)   ( GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) ) )
#define SETBIT(x,i)   ( GETBYTE(x,i) |=  ( 0x01 << ( (i) % BITBYTE ) ) )
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )

/* NOTE: these evaluate their arguments more than once */
#define abs(a)			((a) <	(0) ? -(a) : (a))
#define min(a,b)			((a) <	(b) ? (a) : (b))

/* map a value to a bit position of the signature and set that bit */
#define HASHVAL(val) (((unsigned int)(val)) % SIGLENBIT)
#define HASH(sign, val) SETBIT((sign), HASHVAL(val))

/*
 * type of index key
 */
typedef struct {
	int4	len;		/* used by ARRNELEM as total size in bytes, header included */
	int4	flag;		/* combination of ARRKEY / SIGNKEY / ALLISTRUE */
	char	data[1];	/* int4 array or signature, see GETARR / GETSIGN */
} GISTTYPE;

#define ARRKEY		0x01
#define SIGNKEY		0x02
#define ALLISTRUE	0x04

#define ISARRKEY(x)	( ((GISTTYPE*)(x))->flag & ARRKEY )
#define ISSIGNKEY(x)	( ((GISTTYPE*)(x))->flag & SIGNKEY )
#define ISALLTRUE(x)	( ((GISTTYPE*)(x))->flag & ALLISTRUE )

#define GTHDRSIZE	( sizeof(int4)*2 )
/* key size: header + array of "len" int4, or + signature (nothing for ALLISTRUE) */
#define CALCGTSIZE(flag, len) ( GTHDRSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(int4)) : (((flag) & ALLISTRUE) ? 0 : SIGLEN) ) )

#define GETSIGN(x)	( (BITVECP)( (char*)(x)+GTHDRSIZE ) )
#define GETARR(x)	( (int4*)( (char*)(x)+GTHDRSIZE ) )
#define ARRNELEM(x) ( ( ((GISTTYPE*)(x))->len - GTHDRSIZE )/sizeof(int4) )

#endif
#!/usr/bin/perl
use strict;
use Getopt::Std;
use locale;
# Command-line options, filled in by getopts() below.
my %opt;

# -o takes a FILENAME argument (see the usage text), so it has to be
# declared as 'o:'.  Declared as a bare 'o' it was a boolean flag and
# the output ended up in a file literally named "1".
getopts('l:he:s:ap:o:m:f', \%opt);

# Print usage when asked for (-h), when neither an endings file (-e) nor
# a stop-word file (-s) is given, or when the locale name (-l) is missing.
if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) {
	print<<EOT;
Generator of variant of the Lovin's stemmer which
uses a longest match algorithm.
Author Teodor Sigaev <teodor\@stack.net>
Usage:
$0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ]
-e FILENAME - file with endings of word
-s FILENAME - file with list of stop-word
-o FILENAME - out file, default STDOUT
-a - stop-word are strimmed
-p PREFIX - prefix of function and etc, default strimmed locale
-m NUMBER - minimal length of rest after semming, default 3
-l LOCALENAME - name of locale
-f - do not call tolower for each char
At least one of -e or -s must be defined
EOT
	exit;
}

# Default prefix for generated identifiers: the locale name with every
# character outside [a-zA-Z0-9_] removed.
if ( ! defined $opt{p} ) {
	$opt{p} = $opt{l};
	$opt{p}=~s/[^a-zA-Z0-9_]+//g;
}
$opt{m}=3 if ! defined $opt{m};

# Text of the C initializers for the endings tree and the stop-word tree.
my ($enddata,$stopdata) = ('','');
# Largest child offset emitted by printstruct(); selects the width of
# the "child" field below.
my $maxchild = 0;

if ( $opt{e} ) {
	my @tree;
	# endings are inserted reversed (third argument is true)
	buildtree(\@tree, $opt{e}, 1);
	printstruct( \@tree, 0, \$enddata);
	undef @tree;
}

if ( $opt{s} ) {
	my @tree;
	buildtree(\@tree, $opt{s}, 0);
	printstruct( \@tree, 0, \$stopdata);
	undef @tree;
}

die "No data\n" if ( ! (length $enddata || length $stopdata) );

# A missing tree is replaced by a single all-zero node.
$enddata = "\t{0,0,0,0}" if ( ! length $enddata );
$stopdata = "\t{0,0,0,0}" if ( ! length $stopdata );

my $fh=\*STDOUT;
if ( $opt{o} ) {
	# 3-argument open: the name is never interpreted as a mode or pipe
	open(OUT, '>', $opt{o}) || die "Can't open file '$opt{o}' for writing\n";
	$fh = \*OUT;
}

# Smallest unsigned type able to hold every child offset.
my $linktype = 'uint32';
if ( $maxchild <= 0xff ) {
	$linktype='uint8';
} elsif ( $maxchild <= 0xffff ) {
	$linktype='uint16';
}

# Last two entries of the generated dictionary table row: with -a the
# stop-word function goes into the second of the two slots, otherwise
# into the first.
my $wherecheck = ( $opt{a} ) ?
	"NULL,\n\t$opt{p}_is_stopword"
	:
	"$opt{p}_is_stopword,\n\tNULL";

# C fragments that lower-case the input; left empty when -f is given.
my ($tolower, $resttolower) = ('','');
if ( ! $opt{f} ) {
	$tolower = '*cur = tolower( *cur );';
	$resttolower=<<EOT;
while( cur - buf >= 0 ) {
*cur = tolower(*cur);
cur--;
}
EOT
}
# Emit the generated C source to $fh.  The heredoc is interpolated, so
# $opt{p} (identifier prefix), $opt{m}, $opt{l}, $linktype, $enddata,
# $stopdata, $tolower, $resttolower and $wherecheck are substituted.
# The output has two parts: the code guarded by DICT_BODY (node type,
# both trees, the stem and stop-word functions) and the table entry
# guarded by DICT_TABLE.
print {$fh} <<EOT;
/*
* Autogenerated file
*
* Variant of the Lovin's stemmer which uses a longest match algorithm.
* Endings are stored in a suffix tree.
*/
#ifdef DICT_BODY
typedef struct {
uint8 val;
uint8 flag;
uint8 right;
$linktype child;
} $opt{p}_NODE;
/* is exists left tree ? */
#define L 0x01
/* finish word flag */
#define F 0x02
#define ISLEFT(x) ((($opt{p}_NODE*)x)->flag & L)
#define ISFINISH(x) ((($opt{p}_NODE*)x)->flag & F)
#define MINLENREST $opt{m}
static $opt{p}_NODE $opt{p}_endstree[]={
$enddata
};
static $opt{p}_NODE $opt{p}_stoptree[]={
$stopdata
};
static char*
$opt{p}_stem( void* obj, char *in, int *len ) {
$opt{p}_NODE *ptr = $opt{p}_endstree;
int result = 0;
uint8 *buf = (uint8 *)in;
uint8 *cur = buf + (*len) - 1;
while( cur - buf >= MINLENREST ) {
$tolower
if ( ptr->val == *cur ) {
if ( ISFINISH(ptr) ) result = buf + (*len) - cur;
cur--;
if ( ! ptr->child ) break;
ptr += ptr->child;
} else if ( ptr->val > *cur ) {
if ( ISLEFT(ptr) )
ptr++;
else
break;
} else {
if ( ptr->right )
ptr += ptr->right;
else
break;
}
}
$resttolower
*len -= result;
return in;
}
static int
$opt{p}_is_stopword( void *obj, char *in, int len ) {
$opt{p}_NODE *ptr = $opt{p}_stoptree;
int result = 0;
uint8 *buf = (uint8 *)in;
uint8 *cur = buf;
while( cur - buf < len ) {
$tolower
if ( ptr->val == *cur ) {
cur++;
if ( ISFINISH(ptr) ) result = cur - buf;
if ( ! ptr->child ) break;
ptr += ptr->child;
} else if ( ptr->val > *cur ) {
if ( ISLEFT(ptr) )
ptr++;
else
break;
} else {
if ( ptr->right )
ptr += ptr->right;
else
break;
}
}
return (result==len) ? 1 : 0;
}
#undef L
#undef F
#undef ISLEFT
#undef ISFINISH
#undef MINLENREST
#endif /* DICT_BODY */
#ifdef DICT_TABLE
TABLE_DICT_START
\"$opt{l}\",
NULL,
NULL,
$opt{p}_stem,
$wherecheck
TABLE_DICT_END
#endif
EOT
# Close the output handle unless it is STDOUT.
close($fh) if ( $fh != \*STDOUT );
# buildtree(\@tree, $file, $needreverse)
#
# Reads $file line by line and inserts every non-empty line into the
# character tree @tree with addtostruct().  Lines are lower-cased unless
# the -f option was given; when $needreverse is true each line is
# reversed before insertion.  Dies if the file cannot be opened.
sub buildtree {
	my ($reftree,$file, $needreverse) = @_;

	# 3-argument open with a lexical handle: the file name is never
	# interpreted as a mode or pipe, and Perl's special DATA handle is
	# not reused.
	open(my $in, '<', $file) || die "Can't open file '$file'\n";
	while(<$in>) {
		chomp;
		next if ! length $_;
		$_ = lc($_) if ! $opt{f};
		addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ );
	}
	close $in;
}
# mkbintree($start, $stop, \@prop, \@res)
#
# Appends the elements $prop[$start..$stop] to @res in pre-order of a
# balanced binary tree: the middle element first, then the left half,
# then the right half.  Each appended element gets
#   left  - 1 when a left subtree follows it directly, else 0
#   right - distance to the first element of its right subtree
#           (0 for a range consisting of a single element)
# Returns the number of elements appended.
sub mkbintree {
	my ( $start, $stop, $rprop, $rres ) = @_;
	my $pivot = $start + int( ( $stop - $start ) / 2 );

	push @{$rres}, $rprop->[$pivot];
	my $pos = $#{$rres};
	@{ $rres->[$pos] }{ 'right', 'left' } = ( 0, 0 );

	return 1 if $start == $stop;

	my $nleft = 0;
	if ( $pivot == $start ) {
		# no left half: the right subtree is the very next element
		$rres->[$pos]{right} = 1;
	} else {
		$rres->[$pos]{left} = 1;
		$nleft = mkbintree( $start, $pivot - 1, $rprop, $rres );
		$rres->[$pos]{right} = $nleft + 1;
	}

	my $nright = mkbintree( $pivot + 1, $stop, $rprop, $rres );
	return 1 + $nleft + $nright;
}
# addtostruct(\@node, $word)
#
# Inserts $word into the character tree rooted at @node.  The tree is an
# array indexed by character code; each used slot is a hash with
#   finish - 0 (or undefined) when a word ends at this character,
#            otherwise the length of the remainder seen when the slot
#            was created
#   child  - array of the same shape for the following character
sub addtostruct {
	my ( $node, $word ) = @_;
	my ( $head, $rest ) = split( '', $word, 2 );
	my $code    = ord($head);
	my $restlen = length $rest;

	if ( defined $node->[$code] ) {
		# existing slot: only mark it when a word ends here
		$node->[$code]{finish} = 0 if !$restlen;
	} else {
		$node->[$code] = { finish => $restlen, child => [] };
	}

	addtostruct( $node->[$code]{child}, $rest ) if $restlen;
}
# printstruct(\@node, $pre, \$out)
#
# Appends to $$out the C initializer lines for one level of the
# character tree built by addtostruct(), followed by the lines of all
# deeper levels.  $pre is true for recursive calls and makes the level
# start with a separating ",\n\n".  Returns the total number of nodes
# emitted (this level plus all levels below it); 0 when the level is empty.
# Side effect: updates the file-level $maxchild with the largest child
# offset written.
sub printstruct {
my ($node, $pre, $refout) = @_;
my $add = 0;
my @prop;
my $outchild;
my $current = 0;
my $poschild=0;
my @tmp;
# Collect the used character codes in ascending order.  The children of
# each one are rendered into $outchild right away; poschild records how
# many child nodes were emitted before this character's children.
foreach my $i ( 0..255 ) {
next if ( !defined $node->[ $i ] );
push @prop , { val=>$i,
nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ),
poschild=>$poschild };
$poschild += $prop[$#prop]{nchild};
}
return 0 if $#prop < 0;
if ($pre) {
$$refout .= ",\n\n";
}
# Reorder this level into the binary-tree layout (sets left/right).
mkbintree(0,$#prop,\@prop,\@tmp);
@prop = @tmp;
# $current counts the nodes of this level from the one being printed to
# the end of the level; poschild + $current is therefore the offset from
# the current node to the first node of its children.
$current=$#prop+1;
foreach my $i ( 0..$#prop ) {
my $flag = ($prop[$i]{left}) ? 'L' : undef;
# a "finish" value equal to 0 marks the end of a word
if ( $node->[ $prop[$i]{val} ]{finish}==0 ) {
$flag .= '|' if defined $flag;
$flag .= 'F';
} elsif ( ! defined $flag ) {
$flag='0';
}
# one initializer: {'char', flags, right offset, child offset (0 = no children)}
$$refout .= "\t{'".chr( $prop[$i]{val} )."',".
$flag.','.
$prop[$i]{right}.','.
(($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'.
(($i==$#prop)? '' : ",\n");
$maxchild = $prop[$i]{poschild}+$current if
( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild );
$current--;
$add += $prop[$i]{nchild};
}
# deeper levels follow after all nodes of this level
$$refout .= $outchild;
return $#prop+1 + $add;
}
/*
* morphology module
* New dictionaries are included via dict.h. For languages which
* use a Latin charset it may be necessary to modify the mapdict table.
* Teodor Sigaev <teodor@stack.net>
*/
#include "postgres.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/builtins.h"
#include "catalog/pg_control.h"
#include "utils/pg_locale.h"
#include "morph.h"
#include "deflex.h"
/*
* Struct for calling dictionaries.
* All of these methods are optional, but
* if all methods are NULL, then the dictionary does nothing :)
* The return value of lemmatize must be palloc'ed or be the input
* pointer itself.
* The return value of init must be malloc'ed; otherwise
* it will be freed at the end of the transaction!
*/
typedef struct {
/* locale name this dictionary is selected for (compared with the current locale in initmorph) */
char localename[LOCALE_NAME_BUFLEN];
/* init dictionary; the result is stored in dictobjs[] and passed to the other methods */
void* (*init)(void);
/* close dictionary */
void (*close)(void*);
/* find in dictionary: (object, word, in/out length) */
char* (*lemmatize)(void*,char*,int*);
/* stop-word check applied before lemmatize: (object, word, length) */
int (*is_stoplemm)(void*,char*,int);
/* stop-word check applied after lemmatize changed the word */
int (*is_stemstoplemm)(void*,char*,int);
} DICT;
/* insert all dictionaries: with DICT_BODY defined, dict.h provides their code and data */
#define DICT_BODY
#include "dict.h"
#undef DICT_BODY
/*
 * fill dictionary's structure: with DICT_TABLE defined, dict.h expands
 * to one ",{...}" table entry per dictionary, appended after the fake
 * entry at index 0
 */
#define DICT_TABLE
DICT dicts[] = {
{
"C",NULL,NULL,NULL,NULL,NULL /* fake dictionary */
}
#include "dict.h"
};
#undef DICT_TABLE
/* array for storing dictionary's objects (if needed); same indexing as dicts[] */
void* dictobjs[ lengthof(dicts) ];
/*
 * Values stored in mapdict entries.  Non-negative values other than
 * NODICT are used as indexes into dicts[].
 */
#define STOPLEXEM -2	/* lexemes of this type are stop words (see lemmatize, is_stoptype) */
#define BYLOCALE -1	/* replaced in initmorph by the dictionary matching the current locale */
#define NODICT 0	/* no dictionary; ends the list for a lexeme type */
#define DEFAULTDICT 1	/* index 1 of dicts[] */
/* number of dictionary slots per lexeme type */
#define MAXNDICT 2
typedef int2 MAPDICT[MAXNDICT];
/* i-th slot of the MAPDICT that x points to */
#define GETDICT(x,i) *( ((int2*)(x)) + (i) )
/* map dictionaries for lexem type; rows are indexed by the type codes of deflex.h */
static MAPDICT mapdict[] = {
{NODICT, NODICT}, /* not used */
{DEFAULTDICT, NODICT}, /* LATWORD */
{BYLOCALE, NODICT}, /* NONLATINWORD */
{BYLOCALE, DEFAULTDICT}, /* UWORD */
{NODICT, NODICT}, /* EMAIL */
{NODICT, NODICT}, /* FURL */
{NODICT, NODICT}, /* HOST */
{NODICT, NODICT}, /* FLOAT */
{NODICT, NODICT}, /* FINT */
{BYLOCALE, DEFAULTDICT}, /* PARTWORD */
{BYLOCALE, NODICT}, /* NONLATINPARTWORD */
{DEFAULTDICT, NODICT}, /* LATPARTWORD */
{STOPLEXEM, NODICT}, /* SPACE */
{STOPLEXEM, NODICT}, /* SYMTAG */
{STOPLEXEM, NODICT}, /* HTTP */
{BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
{DEFAULTDICT, NODICT}, /* DEFISLATWORD */
{BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
{NODICT, NODICT}, /* URI */
{NODICT, NODICT} /* FILEPATH */
};
/* set once initmorph() has completed, so that later calls return immediately */
static bool inited=false;

/*
 * initmorph - one-time preparation of the dictionary mapping.
 *
 * For every lexeme type the list of dictionaries in mapdict is
 * compacted in place: BYLOCALE is replaced by the dictionary whose
 * localename equals the current locale (only with USE_LOCALE) or
 * dropped when there is none, entries that are not valid dicts[]
 * indexes are dropped, and unused trailing slots are set to NODICT.
 * Afterwards the init method of every dictionary that is referenced
 * is called and its result stored in dictobjs[].
 */
void initmorph(void) {
	int i,j,k;
	MAPDICT *md;
	bool needinit[ lengthof(dicts) ];
#ifdef USE_LOCALE
	PG_LocaleCategories lc;
	int bylocaledict = NODICT;
#endif

	if ( inited ) return;

	for(i=0; i<lengthof(dicts);i++)
		needinit[i] = false;

#ifdef USE_LOCALE
	/* look for a dictionary registered for the current locale */
	PGLC_current(&lc);
	for(i=1;i<lengthof(dicts);i++)
		if (strcmp( dicts[i].localename, lc.lang ) == 0) {
			bylocaledict = i;
			break;
		}
	PGLC_free_categories(&lc);
#endif

	/* row 0 of mapdict is not used */
	for(i=1; i<lengthof(mapdict);i++) {
		k=0;	/* write position; j is the read position */
		md = &mapdict[i];
		for(j=0;j<MAXNDICT;j++) {
			GETDICT(md,k) = GETDICT(md,j);
			if ( GETDICT(md,k) == NODICT ) {
				break;
			} else if ( GETDICT(md,k) == STOPLEXEM ) {
				/*
				 * Keep the marker in place, but never use it as an
				 * index: it is negative, and needinit[STOPLEXEM] would
				 * be a write in front of the array.
				 */
				k++;
				continue;
			} else if ( GETDICT(md,k) == BYLOCALE ) {
#ifdef USE_LOCALE
				if ( bylocaledict == NODICT )
					continue;
				GETDICT(md,k) = bylocaledict;
#else
				continue;
#endif
			}
			/* drop anything that is not a valid index into dicts[] */
			if ( GETDICT(md,k) < 0 || GETDICT(md,k) >= (int2)lengthof(dicts) )
				continue;
			needinit[ GETDICT(md,k) ] = true;
			k++;
		}
		/* clear the slots left over after compaction */
		for(;k<MAXNDICT;k++)
			if ( GETDICT(md,k) != STOPLEXEM )
				GETDICT(md,k) = NODICT;
	}

	for(i=1; i<lengthof(dicts);i++)
		if ( needinit[i] && dicts[i].init )
			dictobjs[i] = (*(dicts[i].init))();

	inited = true;
	return;
}
char* lemmatize( char* word, int *len, int type ) {
int2 nd;
int i;
DICT *dict;
for(i=0;i<MAXNDICT;i++) {
nd = GETDICT( &mapdict[type], i );
if ( nd == NODICT ) {
/* there is no dictionary */
return word;
} else if ( nd == STOPLEXEM ) {
/* word is stopword */
return NULL;
} else {
dict = &dicts[ nd ];
if ( dict->is_stoplemm && (*(dict->is_stoplemm))(dictobjs[nd], word, *len) )
return NULL;
if ( dict->lemmatize ) {
int oldlen = *len;
char *newword = (*(dict->lemmatize))(dictobjs[nd], word, len);
/* word is recognized by distionary */
if ( newword != word || *len != oldlen ) {
if ( dict->is_stemstoplemm &&
(*(dict->is_stemstoplemm))(dictobjs[nd], word, *len) ) {
if ( newword != word && newword)
pfree(newword);
return NULL;
}
return newword;
}
}
}
}
return word;
}
/*
 * is_stoptype - true when the first mapdict slot of the given lexeme
 * type is STOPLEXEM.
 */
bool is_stoptype(int type) {
	if ( GETDICT( &mapdict[type], 0 ) == STOPLEXEM )
		return true;
	return false;
}
#ifndef __MORPH_H__
#define __MORPH_H__
/* prepare the dictionary mapping and initialize the dictionaries; later calls do nothing */
void initmorph(void);
/*
 * run word (length *len, lexeme type "type") through the mapped
 * dictionaries; returns NULL for a stop word, otherwise the resulting
 * word, with *len updated by the dictionary
 */
char* lemmatize( char* word, int *len, int type );
/* true when lexemes of the given type are mapped to STOPLEXEM */
bool is_stoptype(int type);
#endif
#ifndef __PARSER_H__
#define __PARSER_H__

/* FILE is used in the prototype of start_parse_fh() */
#include <stdio.h>

/* text of the current token; the variable itself is defined in parser.l */
extern char *token;
/*
 * length of the current token.
 * NOTE(review): parser.l does not define this variable, so this
 * tentative definition is the only one and is kept as is; moving the
 * definition into parser.l would allow declaring it extern here too.
 */
int tokenlen;

/* returns the type code of the next token and sets token / tokenlen */
int tsearch_yylex(void);
/* start parsing "limit" bytes of a string */
void start_parse_str(char*, int);
/* start parsing a file handle; a limit of 0 means unlimited */
void start_parse_fh(FILE*, int);
/* release scanner state after parsing */
void end_parse(void);

#endif
%{
#include <string.h>
#include "deflex.h"
#include "parser.h"
/* postgres allocation function */
#include "postgres.h"
#define free pfree
#define malloc palloc
#define realloc repalloc
#ifdef strdup
#undef strdup
#endif
#define strdup pstrdup
char *token = NULL; /* pointer to token */
char *s = NULL; /* for returning full defis-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
int lrlimit = -1; /* for limiting read from filehandle ( -1 - unlimited read ) */
int bytestoread = 0; /* for limiting read from filehandle */
/*
 * redefine macro for read limited length
 *
 * Interactive buffers are read character by character up to and
 * including a newline.  Otherwise at most lrlimit bytes in total are
 * read with fread(): once lrlimit has reached 0 the macro reports end
 * of input (YY_NULL); a negative lrlimit means no limit.  Note that
 * lrlimit is decreased by the requested amount before fread() is called.
 */
#define YY_INPUT(buf,result,max_size) \
if ( yy_current_buffer->yy_is_interactive ) { \
int c = '*', n; \
for ( n = 0; n < max_size && \
(c = getc( tsearch_yyin )) != EOF && c != '\n'; ++n ) \
buf[n] = (char) c; \
if ( c == '\n' ) \
buf[n++] = (char) c; \
if ( c == EOF && ferror( tsearch_yyin ) ) \
YY_FATAL_ERROR( "input in flex scanner failed" ); \
result = n; \
} else { \
if ( lrlimit == 0 ) \
result=YY_NULL; \
else { \
if ( lrlimit>0 ) { \
bytestoread = ( lrlimit > max_size ) ? max_size : lrlimit; \
lrlimit -= bytestoread; \
} else \
bytestoread = max_size; \
if ( ((result = fread( buf, 1, bytestoread, tsearch_yyin )) == 0) \
&& ferror( tsearch_yyin ) ) \
YY_FATAL_ERROR( "input in flex scanner failed" ); \
} \
}
#define YY_NO_UNPUT
%}
/* parser's state for parsing defis-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
/* parser's states for parsing a tag (INTAG) and a quoted string inside a tag (QINTAG) */
%x INTAG
%x QINTAG
/* NONLATIN char */
NONLATINALNUM [0-9\200-\377]
NONLATINALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[[:alpha:]] { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"</"[[:alpha:]] { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<>" {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
"<"[^>[:alpha:]] {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
<INTAG>"\"" { BEGIN QINTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>"\\\"" {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>"\"" { BEGIN INTAG;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<QINTAG>.|\n {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<INTAG>">" { BEGIN INITIAL;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
<INTAG>.|\n {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SYMTAG;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return EMAIL;
}
<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FINT;
}
<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FINT;
}
[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FLOAT;
}
http"://" {
BEGIN URL;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return HTTP;
}
ftp"://" {
BEGIN URL;
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return HTTP;
}
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
BEGIN SERVER;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return FURL;
}
<SERVER,URL,INITIAL>{HOSTNAME} {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return HOST;
}
<SERVER>[/:]{URI} {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return URI;
}
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return FILEPATH;
}
({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return DEFISNONLATINWORD;
}
([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
tokenlen = tsearch_yyleng;
s = strdup( tsearch_yytext );
yyless( 0 );
token = s;
return DEFISLATWORD;
}
({ALNUM}+-)+{ALPHA}+ /* composite-word */ {
BEGIN DELIM;
if (s) { free(s); s=NULL; }
s = strdup( tsearch_yytext );
tokenlen = tsearch_yyleng;
yyless( 0 );
token = s;
return DEFISWORD;
}
<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return NONLATINPARTWORD;
}
<DELIM>[[:alnum:]]+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATPARTWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return PARTWORD;
}
<DELIM>- {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
tokenlen = tsearch_yyleng;
yyless( 0 );
}
{NONLATINALNUM}+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return NONLATINWORD;
}
[[:alnum:]]+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return LATWORD;
}
{ALNUM}+ /* normal word */ {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return UWORD;
}
.|\n {
token = tsearch_yytext;
tokenlen = tsearch_yyleng;
return SPACE;
}
%%
/*
 * End-of-input hook of the scanner.  Always returns 1, i.e. there is
 * never another input to continue with.
 */
int
tsearch_yywrap(void)
{
	return 1;
}
/*
 * Clean up after parsing: release the saved composite-word copy (if
 * any) and delete the current scanner buffer.
 */
void end_parse() {
	if ( s != NULL ) {
		free(s);
		s = NULL;
	}

	tsearch_yy_delete_buffer( buf );
	buf = NULL;
}
/* start parse from string */
void start_parse_str(char* str, int limit) {
if (buf) end_parse();
buf = tsearch_yy_scan_bytes( str, limit );
tsearch_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}
/*
 * Start parsing from a file handle.  "limit" is stored in lrlimit for
 * YY_INPUT; a limit of 0 selects unlimited reading (-1).  A parse that
 * is still open is finished first.
 */
void start_parse_fh( FILE* fh, int limit ) {
	if ( buf != NULL ) {
		end_parse();
	}

	if ( limit != 0 )
		lrlimit = limit;
	else
		lrlimit = -1;

	buf = tsearch_yy_create_buffer( fh, YY_BUF_SIZE );
	tsearch_yy_switch_to_buffer( buf );
	BEGIN INITIAL;
}
This diff is collapsed.
#ifndef __QUERY_H__
#define __QUERY_H__

/*
#define BS_DEBUG
*/

/*
 * item in polish notation with back link
 * to left operand
 */
typedef struct ITEM {
	int2	type;		/* one of the item type codes below (VAL, OPR, ...) */
	int2	left;		/* offset, in items, from an operator to its left operand */
	int4	val;		/* for operators: the operator character ('!', '&', '|') */
	/* user-friendly value */
	uint16	distance;
	uint16	length;
} ITEM;

/*
 * Storage:
 *	(len)(size)(array of ITEM)(array of operand in user-friendly form)
 */
typedef struct {
	int4	len;
	int4	size;		/* number of ITEMs, see GETOPERAND */
	char	data[1];
} QUERYTYPE;

#define HDRSIZEQT	( 2*sizeof(int4) )
/*
 * All macro arguments and the GETQUERY expansion are parenthesized, so
 * that expressions can be passed as arguments and GETQUERY(x)[i] or
 * GETQUERY(x)->type apply to the ITEM pointer rather than to the
 * operand of the cast.
 */
#define COMPUTESIZE(size,lenofoperand)	( HDRSIZEQT + (size) * sizeof(ITEM) + (lenofoperand) )
#define GETQUERY(x)	( (ITEM*)( (char*)(x)+HDRSIZEQT ) )
#define GETOPERAND(x)	( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )

#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' )

/* item / token type codes */
#define END		0
#define ERR		1
#define VAL		2
#define OPR		3
#define OPEN		4
#define CLOSE		5
#define VALTRUE		6	/* for stop words */
#define VALFALSE	7

bool execute( ITEM* curitem, void *checkval,
	bool calcnot, bool (*chkcond)(void *checkval, ITEM* val ));

#endif
/*
* Rewrite routines of query tree
* Teodor Sigaev <teodor@stack.net>
*/
#include "postgres.h"
#include <float.h>
#include "access/gist.h"
#include "access/itup.h"
#include "access/rtree.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
#include "query.h"
#include "rewrite.h"
/*
 * Node of the query tree built by maketree().  Operator nodes have a
 * right child; every operator except '!' also has a left child.
 */
typedef struct NODE {
struct NODE *left;
struct NODE *right;
/* the ITEM this node was built from; points into the caller's array, not a copy */
ITEM* valnode;
} NODE;
/*
 * Build a tree of palloc'ed NODEs from the plain (array) form of a
 * query.  For an operator item the right operand is the item that
 * follows it and, unless the operator is '!', the left operand is the
 * item "left" positions further on.
 */
static NODE*
maketree(ITEM *in) {
	NODE *result = (NODE*)palloc(sizeof(NODE));

	result->valnode = in;
	result->left = NULL;
	result->right = NULL;

	/* only operators have children */
	if ( in->type != OPR )
		return result;

	result->right = maketree( in + 1 );
	if ( in->val != (int4)'!' )
		result->left = maketree( in + in->left );

	return result;
}
/* Growing output array used by plainnode() / plaintree(). */
typedef struct {
ITEM* ptr; /* palloc'ed array of items */
int4 len; /* allocated size of ptr, in items */
int4 cur; /* number of items written so far */
} PLAINTREE;
/*
 * Append the subtree rooted at node to the output array in plain form
 * (node, right subtree, left subtree), fixing up the "left" offsets,
 * and free the visited nodes.  The array is doubled when it is full.
 */
static void
plainnode(PLAINTREE *state, NODE* node) {
	int4 self;

	if ( state->cur == state->len ) {
		state->len *= 2;
		state->ptr = (ITEM*)repalloc( (void*)state->ptr, state->len*sizeof(ITEM) );
	}

	/* remember our own slot: state->cur moves on while children are written */
	self = state->cur;
	memcpy( (void*)&(state->ptr[self]), (void*)node->valnode, sizeof(ITEM) );
	state->cur++;

	if ( node->valnode->type != VAL ) {
		if ( node->valnode->val == (int4)'!' ) {
			state->ptr[self].left = 1;
			plainnode(state, node->right);
		} else {
			plainnode(state, node->right);
			/* the left operand starts right after the whole right subtree */
			state->ptr[self].left = state->cur - self;
			plainnode(state, node->left);
		}
	}

	pfree(node);
}
/*
 * Convert a query tree back into its plain (array) form.
 * Returns a palloc'ed array and stores the number of items in *len.
 * When root is NULL, or its item is neither VAL nor OPR, NULL is
 * returned and *len is set to 0.
 */
static ITEM*
plaintree(NODE *root, int4 *len) {
	PLAINTREE pl;

	pl.ptr = NULL;
	pl.cur = 0;
	pl.len = 16;	/* initial capacity; plainnode() grows it on demand */

	if ( root != NULL &&
		 (root->valnode->type == VAL || root->valnode->type == OPR) ) {
		pl.ptr = (ITEM*)palloc( pl.len*sizeof(ITEM) );
		plainnode(&pl, root);
	}

	*len = pl.cur;
	return pl.ptr;
}
/*
 * Free a (sub)tree of NODEs; a NULL pointer is accepted and ignored.
 */
static void
freetree(NODE *node) {
	if ( node == NULL )
		return;

	/* the recursion itself ignores missing children */
	freetree(node->left);
	freetree(node->right);
	pfree( node );
}
/*
 * Remove '!' operators from the tree.
 * It is useful for debugging, but otherwise such a view is used with
 * search in an index, where operator ! always returns TRUE.
 *
 * Returns the cleaned subtree, or NULL when nothing is left of it
 * (the removed nodes are freed):
 *	!x	-> removed
 *	a | b	-> removed as soon as one side is removed
 *	a & b	-> the side that is left, if only one is; removed if none is
 */
static NODE*
clean_NOT_intree( NODE* node ) {
	if ( node->valnode->type == VAL )
		return node;

	if ( node->valnode->val == (int4)'!' ) {
		freetree(node);
		return NULL;
	}

	if ( node->valnode->val == (int4)'|' ) {
		/* the right side is only cleaned when the left side survived */
		node->left = clean_NOT_intree(node->left);
		if ( node->left != NULL )
			node->right = clean_NOT_intree(node->right);

		if ( node->left == NULL || node->right == NULL ) {
			freetree(node);
			return NULL;
		}
		return node;
	}

	/* operator & */
	node->left = clean_NOT_intree(node->left);
	node->right = clean_NOT_intree(node->right);
	if ( node->left != NULL && node->right != NULL )
		return node;

	{
		/* at most one side is left: it replaces this node */
		NODE *kept = ( node->left != NULL ) ? node->left : node->right;

		pfree(node);
		return kept;
	}
}
/*
 * Return a new plain query built from ptr with all '!' operators
 * cleaned away (see clean_NOT_intree); the number of items of the
 * result is stored in *len.
 */
ITEM*
clean_NOT(ITEM* ptr, int4 *len) {
	NODE *tree = maketree( ptr );
	NODE *cleaned = clean_NOT_intree( tree );

	return plaintree( cleaned, len );
}
/* what is known about the value of a subtree that has been removed */
#define V_UNKNOWN 0 /* subtree is kept: its value depends on the text */
#define V_TRUE 1
#define V_FALSE 2
/*
* Clean the query tree of values which are always present in the
* text (stop words, items of type VALTRUE).
*
* Returns the simplified subtree.  When the whole subtree collapses to
* a constant, its nodes are freed, NULL is returned and *result is set
* to V_TRUE or V_FALSE; otherwise *result is left as the caller
* initialized it.
* NOTE(review): items of type VALFALSE are not treated specially here;
* such a node has no children and would be handled like an operator -
* confirm that they never reach this function.
*/
static NODE*
clean_fakeval_intree( NODE* node, char *result ) {
char lresult = V_UNKNOWN, rresult = V_UNKNOWN;
if ( node->valnode->type == VAL )
return node;
else if ( node->valnode->type == VALTRUE ) {
/* stop word: always true */
pfree( node );
*result = V_TRUE;
return NULL;
}
if ( node->valnode->val == (int4)'!' ) {
node->right = clean_fakeval_intree( node->right, &rresult );
if ( ! node->right ) {
/* the operand collapsed to a constant: negate it */
*result = ( rresult == V_TRUE ) ? V_FALSE : V_TRUE;
freetree(node);
return NULL;
}
} else if ( node->valnode->val == (int4)'|' ) {
NODE *res = node;
node->left =clean_fakeval_intree(node->left, &lresult);
node->right=clean_fakeval_intree(node->right,&rresult);
if ( lresult == V_TRUE || rresult == V_TRUE ) {
/* TRUE | x is TRUE; freetree also releases a side that is still there */
freetree(node);
*result=V_TRUE;
return NULL;
} else if ( lresult == V_FALSE && rresult == V_FALSE ) {
freetree(node);
*result=V_FALSE;
return NULL;
} else if ( lresult == V_FALSE ) {
/* FALSE | x is x: the other side replaces this node */
res = node->right;
pfree(node);
} else if ( rresult == V_FALSE ) {
res = node->left;
pfree(node);
}
return res;
} else {
/* operator & */
NODE *res = node;
node->left =clean_fakeval_intree(node->left, &lresult);
node->right=clean_fakeval_intree(node->right,&rresult);
if ( lresult == V_FALSE || rresult == V_FALSE ) {
/* FALSE & x is FALSE */
freetree(node);
*result=V_FALSE;
return NULL;
} else if ( lresult == V_TRUE && rresult == V_TRUE ) {
freetree(node);
*result=V_TRUE;
return NULL;
} else if ( lresult == V_TRUE ) {
/* TRUE & x is x: the other side replaces this node */
res = node->right;
pfree(node);
} else if ( rresult == V_TRUE ) {
res = node->left;
pfree(node);
}
return res;
}
/* reached only for a '!' node whose operand was kept */
return node;
}
/*
 * Return a new plain query built from ptr with the stop-word items
 * removed (see clean_fakeval_intree); the number of items of the
 * result is stored in *len.  When the whole query collapses to a
 * constant an error is reported.
 */
ITEM*
clean_fakeval(ITEM* ptr, int4 *len) {
	char result = V_UNKNOWN;
	NODE *resroot = clean_fakeval_intree( maketree( ptr ), &result );

	if ( result == V_UNKNOWN )
		return plaintree(resroot, len);

	elog(ERROR,"Your query contained only stopword(s), ignored");
	/* presumably not reached when elog(ERROR) does not return - TODO confirm */
	*len = 0;
	return NULL;
}
#ifndef __REWRITE_H__
#define __REWRITE_H__
/*
 * Both functions take a query in plain (array) form and return a new
 * array, storing its number of items in *len.  ITEM is declared in
 * query.h, which has to be included before this header.
 */
/* remove the '!' operators from the query */
ITEM* clean_NOT(ITEM* ptr, int4 *len);
/* remove the stop-word items from the query */
ITEM* clean_fakeval(ITEM* ptr, int4 *len);
#endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef __TXTIDX_H__
#define __TXTIDX_H__

/*
#define TXTIDX_DEBUG
*/

#include "postgres.h"

#include "access/gist.h"
#include "access/itup.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"

/* one entry per word; presumably len/pos locate the word in the string area - TODO confirm */
typedef struct {
	uint16	len;
	uint16	pos;
} WordEntry;

/*
 * Value layout: header (len, size), then "size" WordEntry items, then
 * the string area (see ARRPTR / STRPTR / STRSIZE below).
 */
typedef struct {
	int4	len;		/* total size in bytes, see STRSIZE */
	int4	size;		/* number of WordEntry items */
	char	data[1];
} txtidx;

#define DATAHDRSIZE	(sizeof(int4)*2)
/*
 * Macro arguments are parenthesized so that expressions (for example
 * "n + 1" or "p + i") can be passed safely.
 */
#define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) )
#define ARRPTR(x)	( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) )
#define STRPTR(x)	( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((txtidx*)(x))->size ) )
#define STRSIZE(x)	( ((txtidx*)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((txtidx*)(x))->size ) )

/* state of the txtidx input tokenizer, see gettoken_txtidx() */
typedef struct {
	char	*prsbuf;
	char	*word;
	char	*curpos;
	int4	len;
	int4	state;
	bool	oprisdelim;
} TI_IN_STATE;

int4 gettoken_txtidx( TI_IN_STATE *state );

#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment