Commit 978de9d0 authored by Teodor Sigaev

Improvements from Heikki Linnakangas <heikki@enterprisedb.com>

- change the alignment requirement of lexemes in TSVector slightly.
Lexeme strings were always padded to a 2-byte aligned length to make sure
that if there's a position array (uint16[]) it has the right alignment.
The patch changes that so that the padding is not done when there are no
positions. That makes the storage of tsvectors without positions
slightly more compact.

- added some #include "miscadmin.h" lines that I missed earlier when I
added calls to check_stack_depth().

- Reimplemented the send/recv functions, and added a comment
above them describing the on-wire format. The CRC is now recalculated in
tsquery as well, per previous discussion.
parent 8983852e
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.2 2007/09/07 15:09:56 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.3 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -22,7 +22,7 @@ Datum ...@@ -22,7 +22,7 @@ Datum
gin_extract_tsvector(PG_FUNCTION_ARGS) gin_extract_tsvector(PG_FUNCTION_ARGS)
{ {
TSVector vector = PG_GETARG_TSVECTOR(0); TSVector vector = PG_GETARG_TSVECTOR(0);
uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL; Datum *entries = NULL;
*nentries = 0; *nentries = 0;
...@@ -55,7 +55,7 @@ Datum ...@@ -55,7 +55,7 @@ Datum
gin_extract_query(PG_FUNCTION_ARGS) gin_extract_query(PG_FUNCTION_ARGS)
{ {
TSQuery query = PG_GETARG_TSQUERY(0); TSQuery query = PG_GETARG_TSQUERY(0);
uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
StrategyNumber strategy = PG_GETARG_UINT16(2); StrategyNumber strategy = PG_GETARG_UINT16(2);
Datum *entries = NULL; Datum *entries = NULL;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.4 2007/09/07 15:35:10 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.5 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "utils/memutils.h" #include "utils/memutils.h"
#include "utils/pg_crc.h" #include "utils/pg_crc.h"
#include "nodes/bitmapset.h"
struct TSQueryParserStateData struct TSQueryParserStateData
...@@ -384,16 +383,15 @@ makepol(TSQueryParserState state, ...@@ -384,16 +383,15 @@ makepol(TSQueryParserState state,
} }
} }
/*
* Fills in the left-fields previously left unfilled. The input
* QueryItems must be in polish (prefix) notation.
*/
static void static void
findoprnd(QueryItem *ptr, uint32 *pos) findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
{ {
/* since this function recurses, it could be driven to stack overflow. */ /* since this function recurses, it could be driven to stack overflow. */
check_stack_depth(); check_stack_depth();
if (*pos >= nnodes)
elog(ERROR, "malformed tsquery; operand not found");
if (ptr[*pos].type == QI_VAL || if (ptr[*pos].type == QI_VAL ||
ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here, ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here,
* they haven't been cleaned * they haven't been cleaned
...@@ -410,7 +408,7 @@ findoprnd(QueryItem *ptr, uint32 *pos) ...@@ -410,7 +408,7 @@ findoprnd(QueryItem *ptr, uint32 *pos)
{ {
ptr[*pos].operator.left = 1; ptr[*pos].operator.left = 1;
(*pos)++; (*pos)++;
findoprnd(ptr, pos); findoprnd_recurse(ptr, pos, nnodes);
} }
else else
{ {
...@@ -420,13 +418,31 @@ findoprnd(QueryItem *ptr, uint32 *pos) ...@@ -420,13 +418,31 @@ findoprnd(QueryItem *ptr, uint32 *pos)
Assert(curitem->oper == OP_AND || curitem->oper == OP_OR); Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
(*pos)++; (*pos)++;
findoprnd(ptr, pos); findoprnd_recurse(ptr, pos, nnodes);
curitem->left = *pos - tmp; curitem->left = *pos - tmp;
findoprnd(ptr, pos); findoprnd_recurse(ptr, pos, nnodes);
} }
} }
} }
/*
* Fills in the left-fields previously left unfilled. The input
* QueryItems must be in polish (prefix) notation.
*
* ptr is the array of QueryItems and size is the number of items in it.
* The actual work is done by findoprnd_recurse(), which advances a
* position counter as it consumes nodes and raises an error if it runs
* past the end of the array. This wrapper additionally raises an error
* if the walk finishes without having consumed all size items.
*/
static void
findoprnd(QueryItem *ptr, int size)
{
uint32 pos;
pos = 0;
/* walk the expression starting at the first item */
findoprnd_recurse(ptr, &pos, size);
/* anything left over after the walk does not belong to the tree */
if (pos != size)
elog(ERROR, "malformed tsquery; extra nodes");
}
/* /*
* Each value (operand) in the query is passed to pushval. pushval can * Each value (operand) in the query is passed to pushval. pushval can
* transform the simple value to an arbitrarily complex expression using * transform the simple value to an arbitrarily complex expression using
...@@ -452,7 +468,6 @@ parse_tsquery(char *buf, ...@@ -452,7 +468,6 @@ parse_tsquery(char *buf,
TSQuery query; TSQuery query;
int commonlen; int commonlen;
QueryItem *ptr; QueryItem *ptr;
uint32 pos = 0;
ListCell *cell; ListCell *cell;
/* init state */ /* init state */
...@@ -522,8 +537,7 @@ parse_tsquery(char *buf, ...@@ -522,8 +537,7 @@ parse_tsquery(char *buf,
pfree(state.op); pfree(state.op);
/* Set left operand pointers for every operator. */ /* Set left operand pointers for every operator. */
pos = 0; findoprnd(ptr, query->size);
findoprnd(ptr, &pos);
return query; return query;
} }
...@@ -734,6 +748,22 @@ tsqueryout(PG_FUNCTION_ARGS) ...@@ -734,6 +748,22 @@ tsqueryout(PG_FUNCTION_ARGS)
PG_RETURN_CSTRING(nrm.buf); PG_RETURN_CSTRING(nrm.buf);
} }
/*
* Binary Input / Output functions. The binary format is as follows:
*
* uint32 number of operators/operands in the query
*
* Followed by the operators and operands, in prefix notation. For each
* operand:
*
* uint8 type, QI_VAL
* uint8 weight
* operand text in client encoding, null-terminated
*
* For each operator:
* uint8 type, QI_OPR
* uint8 operator, one of OP_AND, OP_OR, OP_NOT.
*/
Datum Datum
tsquerysend(PG_FUNCTION_ARGS) tsquerysend(PG_FUNCTION_ARGS)
{ {
...@@ -744,7 +774,7 @@ tsquerysend(PG_FUNCTION_ARGS) ...@@ -744,7 +774,7 @@ tsquerysend(PG_FUNCTION_ARGS)
pq_begintypsend(&buf); pq_begintypsend(&buf);
pq_sendint(&buf, query->size, sizeof(int32)); pq_sendint(&buf, query->size, sizeof(uint32));
for (i = 0; i < query->size; i++) for (i = 0; i < query->size; i++)
{ {
pq_sendint(&buf, item->type, sizeof(item->type)); pq_sendint(&buf, item->type, sizeof(item->type));
...@@ -752,16 +782,13 @@ tsquerysend(PG_FUNCTION_ARGS) ...@@ -752,16 +782,13 @@ tsquerysend(PG_FUNCTION_ARGS)
switch(item->type) switch(item->type)
{ {
case QI_VAL: case QI_VAL:
pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight)); pq_sendint(&buf, item->operand.weight, sizeof(uint8));
pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc)); pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
pq_sendint(&buf, item->operand.length, sizeof(int16));
/* istrue flag is just for temporary use in tsrank.c/Cover, /* istrue flag is just for temporary use in tsrank.c/Cover,
* so we don't need to transfer that */ * so we don't need to transfer that */
break; break;
case QI_OPR: case QI_OPR:
pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper)); pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
if (item->operator.oper != OP_NOT)
pq_sendint(&buf, item->operator.left, sizeof(item->operator.left));
break; break;
default: default:
elog(ERROR, "unknown tsquery node type %d", item->type); elog(ERROR, "unknown tsquery node type %d", item->type);
...@@ -769,14 +796,6 @@ tsquerysend(PG_FUNCTION_ARGS) ...@@ -769,14 +796,6 @@ tsquerysend(PG_FUNCTION_ARGS)
item++; item++;
} }
item = GETQUERY(query);
for (i = 0; i < query->size; i++)
{
if (item->type == QI_VAL)
pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length);
item++;
}
PG_FREE_IF_COPY(query, 0); PG_FREE_IF_COPY(query, 0);
PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
...@@ -788,141 +807,113 @@ tsqueryrecv(PG_FUNCTION_ARGS) ...@@ -788,141 +807,113 @@ tsqueryrecv(PG_FUNCTION_ARGS)
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSQuery query; TSQuery query;
int i, int i,
size,
len; len;
QueryItem *item; QueryItem *item;
int datalen = 0; int datalen;
char *ptr; char *ptr;
Bitmapset *parentset = NULL; uint32 size;
const char **operands;
size = pq_getmsgint(buf, sizeof(uint32)); size = pq_getmsgint(buf, sizeof(uint32));
if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) if (size > (MaxAllocSize / sizeof(QueryItem)))
elog(ERROR, "invalid size of tsquery"); elog(ERROR, "invalid size of tsquery");
len = HDRSIZETQ + sizeof(QueryItem) * size; /* Allocate space to temporarily hold operand strings */
operands = palloc(size * sizeof(char *));
/* Allocate space for all the QueryItems. */
len = HDRSIZETQ + sizeof(QueryItem) * size;
query = (TSQuery) palloc0(len); query = (TSQuery) palloc0(len);
query->size = size; query->size = size;
item = GETQUERY(query); item = GETQUERY(query);
datalen = 0;
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
{ {
item->type = (int8) pq_getmsgint(buf, sizeof(int8)); item->type = (int8) pq_getmsgint(buf, sizeof(int8));
switch(item->type) if (item->type == QI_VAL)
{ {
case QI_VAL: size_t val_len; /* length after recoding to server encoding */
item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8)); uint8 weight;
item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32)); const char *val;
item->operand.length = pq_getmsgint(buf, sizeof(int16)); pg_crc32 valcrc;
/* Check that the weight bitmap is valid */ weight = (uint8) pq_getmsgint(buf, sizeof(uint8));
if (item->operand.weight < 0 || item->operand.weight > 0xF) val = pq_getmsgstring(buf);
elog(ERROR, "invalid weight bitmap"); val_len = strlen(val);
/* XXX: We don't check that the CRC is valid. Actually, if we /* Sanity checks */
* bothered to calculate it to verify, there would be no need
* to transfer it. if (weight > 0xF)
*/ elog(ERROR, "invalid tsquery; invalid weight bitmap");
/* if (val_len > MAXSTRLEN)
* Check that datalen doesn't grow too large. Without the elog(ERROR, "invalid tsquery; operand too long");
* check, a malicious client could induce a buffer overflow
* by sending a tsquery whose size exceeds 2GB. datalen if (datalen > MAXSTRPOS)
* would overflow, we would allocate a too small buffer below, elog(ERROR, "invalid tsquery; total operand length exceeded");
* and overflow the buffer. Because operand.length is a 20-bit
* field, adding one such value to datalen must exceed /* Looks valid. */
* MaxAllocSize before wrapping over the 32-bit datalen field,
* so this check will protect from it. INIT_CRC32(valcrc);
*/ COMP_CRC32(valcrc, val, val_len);
if (datalen > MAXSTRLEN) FIN_CRC32(valcrc);
elog(ERROR, "invalid tsquery; total operand length exceeded");
item->operand.weight = weight;
/* We can calculate distance from datalen, no need to send it item->operand.valcrc = (int32) valcrc;
* across the wire. If we did, we would have to check that item->operand.length = val_len;
* it's valid anyway. item->operand.distance = datalen;
*/
item->operand.distance = datalen; /*
* Operand strings are copied to the final struct after this loop;
datalen += item->operand.length + 1; /* \0 */ * here we just collect them to an array
*/
break; operands[i] = val;
case QI_OPR:
item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8)); datalen += val_len + 1; /* + 1 for the '\0' terminator */
if (item->operator.oper != OP_NOT && }
item->operator.oper != OP_OR && else if (item->type == QI_OPR)
item->operator.oper != OP_AND) {
elog(ERROR, "unknown operator type %d", (int) item->operator.oper); int8 oper;
oper = (int8) pq_getmsgint(buf, sizeof(int8));
/* if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
* Check that no previous operator node points to the right elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper);
* operand. That would mean that the operand node if (i == size - 1)
* has two parents. elog(ERROR, "invalid pointer to right operand");
*/
if (bms_is_member(i + 1, parentset)) item->operator.oper = oper;
elog(ERROR, "malformed query tree");
parentset = bms_add_member(parentset, i + 1);
if(item->operator.oper != OP_NOT)
{
uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32));
/*
* Right operand is implicitly at "this+1". Don't allow
* left to point to the right operand, or to self.
*/
if (left <= 1 || i + left >= size)
elog(ERROR, "invalid pointer to left operand");
/*
* Check that no previous operator node points to the left
* operand.
*/
if (bms_is_member(i + left, parentset))
elog(ERROR, "malformed query tree");
parentset = bms_add_member(parentset, i + left);
item->operator.left = left;
}
else
item->operator.left = 1; /* do not leave uninitialized fields */
if (i == size - 1)
elog(ERROR, "invalid pointer to right operand");
break;
default:
elog(ERROR, "unknown tsquery node type %d", item->type);
} }
else
elog(ERROR, "unknown tsquery node type %d", item->type);
item++; item++;
} }
/* Now check that each node, except the root, has a parent. We /* Enlarge buffer to make room for the operand values. */
* already checked above that no node has more than one parent. */
if (bms_num_members(parentset) != size - 1 && size != 0)
elog(ERROR, "malformed query tree");
bms_free( parentset );
query = (TSQuery) repalloc(query, len + datalen); query = (TSQuery) repalloc(query, len + datalen);
item = GETQUERY(query); item = GETQUERY(query);
ptr = GETOPERAND(query); ptr = GETOPERAND(query);
/*
* Fill in the left-pointers. Checks that the tree is well-formed
* as a side-effect.
*/
findoprnd(item, size);
/* Copy operands to output struct */
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
{ {
if (item->type == QI_VAL) if (item->type == QI_VAL)
{ {
memcpy(ptr, memcpy(ptr, operands[i], item->operand.length + 1);
pq_getmsgbytes(buf, item->operand.length), ptr += item->operand.length + 1;
item->operand.length);
ptr += item->operand.length;
*ptr++ = '\0';
} }
item++; item++;
} }
pfree(operands);
Assert(ptr - GETOPERAND(query) == datalen); Assert(ptr - GETOPERAND(query) == datalen);
SET_VARSIZE(query, len + datalen); SET_VARSIZE(query, len + datalen);
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "tsearch/ts_type.h" #include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "miscadmin.h"
typedef struct NODE typedef struct NODE
{ {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "executor/spi.h" #include "executor/spi.h"
#include "tsearch/ts_type.h" #include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "miscadmin.h"
static int static int
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "tsearch/ts_type.h" #include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "miscadmin.h"
QTNode * QTNode *
QT2QTN(QueryItem * in, char *operand) QT2QTN(QueryItem * in, char *operand)
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.3 2007/09/07 15:35:10 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "tsearch/ts_type.h" #include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h" #include "tsearch/ts_utils.h"
#include "utils/array.h" #include "utils/array.h"
#include "miscadmin.h"
static float weights[] = {0.1, 0.2, 0.4, 1.0}; static float weights[] = {0.1, 0.2, 0.4, 1.0};
...@@ -176,8 +177,9 @@ SortAndUniqItems(TSQuery q, int *size) ...@@ -176,8 +177,9 @@ SortAndUniqItems(TSQuery q, int *size)
return res; return res;
} }
/* A dummy WordEntryPos array to use when haspos is false */
static WordEntryPos POSNULL[] = { static WordEntryPos POSNULL[] = {
0, 1, /* Number of elements that follow */
0 0
}; };
...@@ -207,7 +209,6 @@ calc_rank_and(float *w, TSVector t, TSQuery q) ...@@ -207,7 +209,6 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
} }
pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
memset(pos, 0, sizeof(uint16 *) * q->size); memset(pos, 0, sizeof(uint16 *) * q->size);
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
...@@ -265,7 +266,6 @@ calc_rank_or(float *w, TSVector t, TSQuery q) ...@@ -265,7 +266,6 @@ calc_rank_or(float *w, TSVector t, TSQuery q)
QueryOperand **item; QueryOperand **item;
int size = q->size; int size = q->size;
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
item = SortAndUniqItems(q, &size); item = SortAndUniqItems(q, &size);
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
...@@ -593,7 +593,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen) ...@@ -593,7 +593,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen)
DocRepresentation *doc; DocRepresentation *doc;
char *operand; char *operand;
*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
operand = GETOPERAND(query); operand = GETOPERAND(query);
reset_istrue_flag(query); reset_istrue_flag(query);
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -75,18 +75,20 @@ uniquePos(WordEntryPos * a, int l) ...@@ -75,18 +75,20 @@ uniquePos(WordEntryPos * a, int l)
} }
static int static int
compareentry(const void *a, const void *b, void *arg) compareentry(const void *va, const void *vb, void *arg)
{ {
char *BufferStr = (char *) arg; char *BufferStr = (char *) arg;
WordEntryIN *a = (WordEntryIN *) va;
WordEntryIN *b = (WordEntryIN *) vb;
if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) if (a->entry.len == b->entry.len)
{ {
return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], return strncmp(&BufferStr[a->entry.pos],
&BufferStr[((WordEntryIN *) b)->entry.pos], &BufferStr[b->entry.pos],
((WordEntryIN *) a)->entry.len); a->entry.len);
} }
return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; return (a->entry.len > b->entry.len) ? 1 : -1;
} }
static int static int
...@@ -104,6 +106,9 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) ...@@ -104,6 +106,9 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
a->poslen = uniquePos(a->pos, a->poslen); a->poslen = uniquePos(a->pos, a->poslen);
*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos); *outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
} }
else
*outbuflen = a->entry.len;
return l; return l;
} }
res = a; res = a;
...@@ -118,10 +123,12 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) ...@@ -118,10 +123,12 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
{ {
if (res->entry.haspos) if (res->entry.haspos)
{ {
*outbuflen += SHORTALIGN(res->entry.len);
res->poslen = uniquePos(res->pos, res->poslen); res->poslen = uniquePos(res->pos, res->poslen);
*outbuflen += res->poslen * sizeof(WordEntryPos); *outbuflen += res->poslen * sizeof(WordEntryPos);
} }
*outbuflen += SHORTALIGN(res->entry.len); else
*outbuflen += res->entry.len;
res++; res++;
memcpy(res, ptr, sizeof(WordEntryIN)); memcpy(res, ptr, sizeof(WordEntryIN));
} }
...@@ -147,12 +154,18 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen) ...@@ -147,12 +154,18 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
} }
ptr++; ptr++;
} }
/* add last item */
if (res->entry.haspos) if (res->entry.haspos)
{ {
*outbuflen += SHORTALIGN(res->entry.len);
res->poslen = uniquePos(res->pos, res->poslen); res->poslen = uniquePos(res->pos, res->poslen);
*outbuflen += res->poslen * sizeof(WordEntryPos); *outbuflen += res->poslen * sizeof(WordEntryPos);
} }
*outbuflen += SHORTALIGN(res->entry.len); else
*outbuflen += res->entry.len;
return res + 1 - a; return res + 1 - a;
} }
...@@ -367,6 +380,18 @@ tsvectorout(PG_FUNCTION_ARGS) ...@@ -367,6 +380,18 @@ tsvectorout(PG_FUNCTION_ARGS)
PG_RETURN_CSTRING(outbuf); PG_RETURN_CSTRING(outbuf);
} }
/*
* Binary Input / Output functions. The binary format is as follows:
*
* uint32 number of lexemes
*
* for each lexeme:
* lexeme text in client encoding, null-terminated
* uint16 number of positions
* for each position:
* uint16 WordEntryPos
*/
Datum Datum
tsvectorsend(PG_FUNCTION_ARGS) tsvectorsend(PG_FUNCTION_ARGS)
{ {
...@@ -381,18 +406,22 @@ tsvectorsend(PG_FUNCTION_ARGS) ...@@ -381,18 +406,22 @@ tsvectorsend(PG_FUNCTION_ARGS)
pq_sendint(&buf, vec->size, sizeof(int32)); pq_sendint(&buf, vec->size, sizeof(int32));
for (i = 0; i < vec->size; i++) for (i = 0; i < vec->size; i++)
{ {
/* uint16 npos;
* We are sure that sizeof(WordEntry) == sizeof(int32)
/* the strings in the TSVector array are not null-terminated, so
* we have to send the null-terminator separately
*/ */
pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
pq_sendbyte(&buf, '\0');
pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); npos = POSDATALEN(vec, weptr);
if (weptr->haspos) pq_sendint(&buf, npos, sizeof(uint16));
if(npos > 0)
{ {
WordEntryPos *wepptr = POSDATAPTR(vec, weptr); WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); for (j = 0; j < npos; j++)
for (j = 0; j < POSDATALEN(vec, weptr); j++)
pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
} }
weptr++; weptr++;
...@@ -407,71 +436,92 @@ tsvectorrecv(PG_FUNCTION_ARGS) ...@@ -407,71 +436,92 @@ tsvectorrecv(PG_FUNCTION_ARGS)
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSVector vec; TSVector vec;
int i; int i;
uint32 size; int32 nentries;
WordEntry *weptr; int datalen; /* number of bytes used in the variable size area
int datalen = 0; * after fixed size TSVector header and WordEntries
Size len; */
Size hdrlen;
size = pq_getmsgint(buf, sizeof(uint32)); Size len; /* allocated size of vec */
if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
nentries = pq_getmsgint(buf, sizeof(int32));
if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
elog(ERROR, "invalid size of tsvector"); elog(ERROR, "invalid size of tsvector");
len = DATAHDRSIZE + sizeof(WordEntry) * size; hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
len = len * 2; /* times two to make room for lexemes */ len = hdrlen * 2; /* times two to make room for lexemes */
vec = (TSVector) palloc0(len); vec = (TSVector) palloc0(len);
vec->size = size; vec->size = nentries;
weptr = ARRPTR(vec); datalen = 0;
for (i = 0; i < size; i++) for (i = 0; i < nentries; i++)
{ {
int32 tmp; const char *lexeme;
uint16 npos;
size_t lex_len;
lexeme = pq_getmsgstring(buf);
npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
/* sanity checks */
lex_len = strlen(lexeme);
if (lex_len < 0 || lex_len > MAXSTRLEN)
elog(ERROR, "invalid tsvector; lexeme too long");
if (datalen > MAXSTRPOS)
elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded");
weptr = ARRPTR(vec) + i; if (npos > MAXNUMPOS)
elog(ERROR, "unexpected number of positions");
/* /*
* We are sure that sizeof(WordEntry) == sizeof(int32) * Looks valid. Fill the WordEntry struct, and copy lexeme.
*
* But make sure the buffer is large enough first.
*/ */
tmp = pq_getmsgint(buf, sizeof(int32)); while (hdrlen + SHORTALIGN(datalen + lex_len) +
*weptr = *(WordEntry *) & tmp; (npos + 1) * sizeof(WordEntryPos) >= len)
while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
{ {
len *= 2; len *= 2;
vec = (TSVector) repalloc(vec, len); vec = (TSVector) repalloc(vec, len);
weptr = ARRPTR(vec) + i;
} }
memcpy(STRPTR(vec) + weptr->pos, vec->entries[i].haspos = (npos > 0) ? 1 : 0;
pq_getmsgbytes(buf, weptr->len), vec->entries[i].len = lex_len;
weptr->len); vec->entries[i].pos = datalen;
datalen += SHORTALIGN(weptr->len);
memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
datalen += lex_len;
if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
elog(ERROR, "lexemes are unordered"); elog(ERROR, "lexemes are unordered");
if (weptr->haspos) /* Receive positions */
if (npos > 0)
{ {
uint16 j, uint16 j;
npos;
WordEntryPos *wepptr; WordEntryPos *wepptr;
npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); /*
if (npos > MAXNUMPOS) * Pad to 2-byte alignment if necessary. Though we used palloc0
elog(ERROR, "unexpected number of positions"); * for the initial allocation, subsequent repalloc'd memory
* areas are not initialized to zero.
while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) */
if (datalen != SHORTALIGN(datalen))
{ {
len *= 2; *(STRPTR(vec) + datalen) = '\0';
vec = (TSVector) repalloc(vec, len); datalen = SHORTALIGN(datalen);
weptr = ARRPTR(vec) + i;
} }
memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
wepptr = POSDATAPTR(vec, weptr);
wepptr = POSDATAPTR(vec, &vec->entries[i]);
for (j = 0; j < npos; j++) for (j = 0; j < npos; j++)
{ {
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
elog(ERROR, "position information is unordered"); elog(ERROR, "position information is unordered");
} }
...@@ -480,7 +530,7 @@ tsvectorrecv(PG_FUNCTION_ARGS) ...@@ -480,7 +530,7 @@ tsvectorrecv(PG_FUNCTION_ARGS)
} }
} }
SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); SET_VARSIZE(vec, hdrlen + datalen);
PG_RETURN_TSVECTOR(vec); PG_RETURN_TSVECTOR(vec);
} }
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.3 2007/09/07 15:09:56 teodor Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -165,7 +165,7 @@ tsvector_strip(PG_FUNCTION_ARGS) ...@@ -165,7 +165,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
char *cur; char *cur;
for (i = 0; i < in->size; i++) for (i = 0; i < in->size; i++)
len += SHORTALIGN(arrin[i].len); len += arrin[i].len;
len = CALCDATASIZE(in->size, len); len = CALCDATASIZE(in->size, len);
out = (TSVector) palloc0(len); out = (TSVector) palloc0(len);
...@@ -179,7 +179,7 @@ tsvector_strip(PG_FUNCTION_ARGS) ...@@ -179,7 +179,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
arrout[i].haspos = 0; arrout[i].haspos = 0;
arrout[i].len = arrin[i].len; arrout[i].len = arrin[i].len;
arrout[i].pos = cur - STRPTR(out); arrout[i].pos = cur - STRPTR(out);
cur += SHORTALIGN(arrout[i].len); cur += arrout[i].len;
} }
PG_FREE_IF_COPY(in, 0); PG_FREE_IF_COPY(in, 0);
...@@ -351,12 +351,15 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -351,12 +351,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
ptr->len = ptr1->len; ptr->len = ptr1->len;
memcpy(cur, data1 + ptr1->pos, ptr1->len); memcpy(cur, data1 + ptr1->pos, ptr1->len);
ptr->pos = cur - data; ptr->pos = cur - data;
cur += SHORTALIGN(ptr1->len);
if (ptr->haspos) if (ptr->haspos)
{ {
cur += SHORTALIGN(ptr1->len);
memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
} }
else
cur += ptr1->len;
ptr++; ptr++;
ptr1++; ptr1++;
i1--; i1--;
...@@ -367,16 +370,20 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -367,16 +370,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
ptr->len = ptr2->len; ptr->len = ptr2->len;
memcpy(cur, data2 + ptr2->pos, ptr2->len); memcpy(cur, data2 + ptr2->pos, ptr2->len);
ptr->pos = cur - data; ptr->pos = cur - data;
cur += SHORTALIGN(ptr2->len);
if (ptr->haspos) if (ptr->haspos)
{ {
int addlen = add_pos(in2, ptr2, out, ptr, maxpos); int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
cur += SHORTALIGN(ptr2->len);
if (addlen == 0) if (addlen == 0)
ptr->haspos = 0; ptr->haspos = 0;
else else
cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
} }
else
cur += ptr2->len;
ptr++; ptr++;
ptr2++; ptr2++;
i2--; i2--;
...@@ -387,9 +394,9 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -387,9 +394,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
ptr->len = ptr1->len; ptr->len = ptr1->len;
memcpy(cur, data1 + ptr1->pos, ptr1->len); memcpy(cur, data1 + ptr1->pos, ptr1->len);
ptr->pos = cur - data; ptr->pos = cur - data;
cur += SHORTALIGN(ptr1->len);
if (ptr->haspos) if (ptr->haspos)
{ {
cur += SHORTALIGN(ptr1->len);
if (ptr1->haspos) if (ptr1->haspos)
{ {
memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
...@@ -407,6 +414,9 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -407,6 +414,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
} }
} }
else
cur += ptr1->len;
ptr++; ptr++;
ptr1++; ptr1++;
ptr2++; ptr2++;
...@@ -421,12 +431,15 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -421,12 +431,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
ptr->len = ptr1->len; ptr->len = ptr1->len;
memcpy(cur, data1 + ptr1->pos, ptr1->len); memcpy(cur, data1 + ptr1->pos, ptr1->len);
ptr->pos = cur - data; ptr->pos = cur - data;
cur += SHORTALIGN(ptr1->len);
if (ptr->haspos) if (ptr->haspos)
{ {
cur += SHORTALIGN(ptr1->len);
memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
} }
else
cur += ptr1->len;
ptr++; ptr++;
ptr1++; ptr1++;
i1--; i1--;
...@@ -438,16 +451,20 @@ tsvector_concat(PG_FUNCTION_ARGS) ...@@ -438,16 +451,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
ptr->len = ptr2->len; ptr->len = ptr2->len;
memcpy(cur, data2 + ptr2->pos, ptr2->len); memcpy(cur, data2 + ptr2->pos, ptr2->len);
ptr->pos = cur - data; ptr->pos = cur - data;
cur += SHORTALIGN(ptr2->len);
if (ptr->haspos) if (ptr->haspos)
{ {
int addlen = add_pos(in2, ptr2, out, ptr, maxpos); int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
cur += SHORTALIGN(ptr2->len);
if (addlen == 0) if (addlen == 0)
ptr->haspos = 0; ptr->haspos = 0;
else else
cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
} }
else
cur += ptr2->len;
ptr++; ptr++;
ptr2++; ptr2++;
i2--; i2--;
...@@ -484,8 +501,8 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item) ...@@ -484,8 +501,8 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item)
static bool static bool
checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item) checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item)
{ {
WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16));
uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); uint16 len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len)));
while (len--) while (len--)
{ {
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
* *
* Copyright (c) 1998-2007, PostgreSQL Global Development Group * Copyright (c) 1998-2007, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.3 2007/09/07 15:35:11 teodor Exp $ * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.4 2007/09/07 16:03:40 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -62,26 +62,33 @@ typedef uint16 WordEntryPos; ...@@ -62,26 +62,33 @@ typedef uint16 WordEntryPos;
* bytes from end of WordEntry array to start of * bytes from end of WordEntry array to start of
* corresponding lexeme. * corresponding lexeme.
* 4) Lexeme's storage: * 4) Lexeme's storage:
* SHORTALIGNED(lexeme) and position information if it exists * lexeme (without null-terminator)
* Position information: first int2 - is a number of positions and it * if haspos is true:
* follows array of WordEntryPos * padding byte if necessary to make the number of positions 2-byte aligned
* uint16 number of positions that follow.
* uint16[] positions
*
* The positions must be sorted.
*/ */
typedef struct typedef struct
{ {
int32 vl_len_; /* varlena header (do not touch directly!) */ int32 vl_len_; /* varlena header (do not touch directly!) */
uint32 size; int32 size;
char data[1]; WordEntry entries[1]; /* var size */
/* lexemes follow */
} TSVectorData; } TSVectorData;
typedef TSVectorData *TSVector; typedef TSVectorData *TSVector;
#define DATAHDRSIZE (VARHDRSZ + sizeof(int4)) #define DATAHDRSIZE (offsetof(TSVectorData, entries))
#define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) ) #define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) )
#define ARRPTR(x) ( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) ) #define ARRPTR(x) ( (x)->entries )
#define STRPTR(x) ( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
#define STRSIZE(x) ( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) ) /* returns a pointer to the beginning of lexemes */
#define _POSDATAPTR(x,e) (STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len)) #define STRPTR(x) ( (char *) &(x)->entries[x->size] )
#define _POSDATAPTR(x,e) (STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))
#define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 ) #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 )
#define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) ) #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) )
...@@ -159,7 +166,7 @@ typedef int8 QueryItemType; ...@@ -159,7 +166,7 @@ typedef int8 QueryItemType;
typedef struct typedef struct
{ {
QueryItemType type; /* operand or kind of operator (ts_tokentype) */ QueryItemType type; /* operand or kind of operator (ts_tokentype) */
int8 weight; /* weights of operand to search. It's a bitmask of allowed weights. uint8 weight; /* weights of operand to search. It's a bitmask of allowed weights.
* if it =0 then any weight are allowed. * if it =0 then any weight are allowed.
* Weights and bit map: * Weights and bit map:
* A: 1<<3 * A: 1<<3
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment