Commit b4c6d31c authored by Tom Lane

Fix serious performance problems in json(b) to_tsvector().

In an off-list followup to bug #14745, Bob Jones complained that
to_tsvector() on a 2MB jsonb value took an unreasonable amount of
time and space --- enough to draw the wrath of the OOM killer on
his machine.  On my machine, his example proved to require upwards
of 18 seconds and 4GB, which seemed pretty bogus considering that
to_tsvector() on the same data treated as text took just a couple
hundred msec and 10 or so MB.

On investigation, the problem is that the implementation scans each
string element of the json(b) and converts it to tsvector separately,
then applies tsvector_concat() to join those separate tsvectors.
The unreasonable memory usage came from leaking every single one of
the transient tsvectors --- but even without that mistake, this is an
O(N^2) or worse algorithm, because tsvector_concat() has to repeatedly
process the words coming from earlier elements.

We can fix it by accumulating all the lexeme data and applying
make_tsvector() just once.  As a side benefit, that also makes the
desired adjustment of lexeme positions far cheaper, because we can
just tweak the running "pos" counter between JSON elements.

In passing, try to make the explanation of that tweak more intelligible.
(I didn't think that a barely-readable comment far removed from the
actual code was helpful.)  And do some minor other code beautification.
parent fb9bd4b0
...@@ -28,11 +28,11 @@ typedef struct MorphOpaque ...@@ -28,11 +28,11 @@ typedef struct MorphOpaque
typedef struct TSVectorBuildState typedef struct TSVectorBuildState
{ {
ParsedText *prs; ParsedText *prs;
TSVector result;
Oid cfgId; Oid cfgId;
} TSVectorBuildState; } TSVectorBuildState;
static void add_to_tsvector(void *state, char *elem_value, int elem_len); static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
Datum Datum
get_current_ts_config(PG_FUNCTION_ARGS) get_current_ts_config(PG_FUNCTION_ARGS)
...@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS) ...@@ -270,34 +270,33 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
{ {
Oid cfgId = PG_GETARG_OID(0); Oid cfgId = PG_GETARG_OID(0);
Jsonb *jb = PG_GETARG_JSONB(1); Jsonb *jb = PG_GETARG_JSONB(1);
TSVector result;
TSVectorBuildState state; TSVectorBuildState state;
ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText)); ParsedText prs;
prs->words = NULL; prs.words = NULL;
state.result = NULL; prs.curwords = 0;
state.prs = &prs;
state.cfgId = cfgId; state.cfgId = cfgId;
state.prs = prs;
iterate_jsonb_string_values(jb, &state, (JsonIterateStringValuesAction) add_to_tsvector); iterate_jsonb_string_values(jb, &state, add_to_tsvector);
PG_FREE_IF_COPY(jb, 1); if (prs.curwords > 0)
result = make_tsvector(&prs);
if (state.result == NULL) else
{ {
/* /*
* There weren't any string elements in jsonb, so wee need to return * There weren't any string elements in jsonb, so we need to return an
* an empty vector * empty vector
*/ */
result = palloc(CALCDATASIZE(0, 0));
if (prs->words != NULL) SET_VARSIZE(result, CALCDATASIZE(0, 0));
pfree(prs->words); result->size = 0;
state.result = palloc(CALCDATASIZE(0, 0));
SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
state.result->size = 0;
} }
PG_RETURN_TSVECTOR(state.result); PG_FREE_IF_COPY(jb, 1);
PG_RETURN_TSVECTOR(result);
} }
Datum Datum
...@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS) ...@@ -317,33 +316,33 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
{ {
Oid cfgId = PG_GETARG_OID(0); Oid cfgId = PG_GETARG_OID(0);
text *json = PG_GETARG_TEXT_P(1); text *json = PG_GETARG_TEXT_P(1);
TSVector result;
TSVectorBuildState state; TSVectorBuildState state;
ParsedText *prs = (ParsedText *) palloc(sizeof(ParsedText)); ParsedText prs;
prs->words = NULL; prs.words = NULL;
state.result = NULL; prs.curwords = 0;
state.prs = &prs;
state.cfgId = cfgId; state.cfgId = cfgId;
state.prs = prs;
iterate_json_string_values(json, &state, (JsonIterateStringValuesAction) add_to_tsvector); iterate_json_string_values(json, &state, add_to_tsvector);
PG_FREE_IF_COPY(json, 1); if (prs.curwords > 0)
if (state.result == NULL) result = make_tsvector(&prs);
else
{ {
/* /*
* There weren't any string elements in json, so wee need to return an * There weren't any string elements in json, so we need to return an
* empty vector * empty vector
*/ */
result = palloc(CALCDATASIZE(0, 0));
if (prs->words != NULL) SET_VARSIZE(result, CALCDATASIZE(0, 0));
pfree(prs->words); result->size = 0;
state.result = palloc(CALCDATASIZE(0, 0));
SET_VARSIZE(state.result, CALCDATASIZE(0, 0));
state.result->size = 0;
} }
PG_RETURN_TSVECTOR(state.result); PG_FREE_IF_COPY(json, 1);
PG_RETURN_TSVECTOR(result);
} }
Datum Datum
...@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS) ...@@ -359,45 +358,42 @@ json_to_tsvector(PG_FUNCTION_ARGS)
} }
/* /*
* Extend current TSVector from _state with a new one, * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
* build over a json(b) element.
*/ */
static void static void
add_to_tsvector(void *_state, char *elem_value, int elem_len) add_to_tsvector(void *_state, char *elem_value, int elem_len)
{ {
TSVectorBuildState *state = (TSVectorBuildState *) _state; TSVectorBuildState *state = (TSVectorBuildState *) _state;
ParsedText *prs = state->prs; ParsedText *prs = state->prs;
TSVector item_vector; int32 prevwords;
int i;
prs->lenwords = elem_len / 6; if (prs->words == NULL)
if (prs->lenwords == 0) {
prs->lenwords = 2; /*
* First time through: initialize words array to a reasonable size.
* (parsetext() will realloc it bigger as needed.)
*/
prs->lenwords = Max(elem_len / 6, 64);
prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
prs->curwords = 0;
prs->pos = 0;
}
prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords); prevwords = prs->curwords;
prs->curwords = 0;
prs->pos = 0;
parsetext(state->cfgId, prs, elem_value, elem_len); parsetext(state->cfgId, prs, elem_value, elem_len);
if (prs->curwords) /*
{ * If we extracted any words from this JSON element, advance pos to create
if (state->result != NULL) * an artificial break between elements. This is because we don't want
{ * phrase searches to think that the last word in this element is adjacent
for (i = 0; i < prs->curwords; i++) * to the first word in the next one.
prs->words[i].pos.pos = prs->words[i].pos.pos + TS_JUMP; */
if (prs->curwords > prevwords)
item_vector = make_tsvector(prs); prs->pos += 1;
state->result = (TSVector) DirectFunctionCall2(tsvector_concat,
TSVectorGetDatum(state->result),
PointerGetDatum(item_vector));
}
else
state->result = make_tsvector(prs);
}
} }
/* /*
* to_tsquery * to_tsquery
*/ */
......
...@@ -86,15 +86,6 @@ typedef struct ...@@ -86,15 +86,6 @@ typedef struct
#define MAXNUMPOS (256) #define MAXNUMPOS (256)
#define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) ) #define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
/*
* In case if a TSVector contains several parts and we want to treat them as
* separate, it's necessary to add an artificial increment to position of each
* lexeme from every next part. It's required to avoid the situation when
* tsquery can find a phrase consisting of lexemes from two of such parts.
* TS_JUMP defined a value of this increment.
*/
#define TS_JUMP 1
/* This struct represents a complete tsvector datum */ /* This struct represents a complete tsvector datum */
typedef struct typedef struct
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment