Commit 1db5af27 authored by Tom Lane's avatar Tom Lane

Fix gincostestimate to handle ScalarArrayOpExpr reasonably.

The original coding of this function overlooked the possibility that
it could be passed anything except simple OpExpr indexquals.  But
ScalarArrayOpExpr is possible too, and the code would probably crash
(and surely give ridiculous answers) in such a case.  Add logic to try
to estimate sanely for such cases.

In passing, fix the treatment of inner-indexscan cost estimation: it was
failing to scale up properly for multiple iterations of a nestloop.
(I think somebody might've thought that index_pages_fetched() is linear,
but of course it's not.)

Report, diagnosis, and preliminary patch by Marti Raudsepp; I refactored
it a bit and fixed the cost estimation.

Back-patch into 9.1 where the bogus code was introduced.
parent d0024cd1
......@@ -6575,6 +6575,20 @@ spgcostestimate(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
/*
* Support routines for gincostestimate
*/
typedef struct
{
bool haveFullScan;
double partialEntries;
double exactEntries;
double searchEntries;
double arrayScans;
} GinQualCounts;
/* Find the index column matching "op"; return its index, or -1 if no match */
static int
find_index_column(Node *op, IndexOptInfo *index)
......@@ -6590,6 +6604,277 @@ find_index_column(Node *op, IndexOptInfo *index)
return -1;
}
/*
* Estimate the number of index terms that need to be searched for while
* testing the given GIN query, and increment the counts in *counts
* appropriately. If the query is unsatisfiable, return false.
*/
static bool
gincost_pattern(IndexOptInfo *index, int indexcol,
Oid clause_op, Datum query,
GinQualCounts *counts)
{
Oid extractProcOid;
int strategy_op;
Oid lefttype,
righttype;
int32 nentries = 0;
bool *partial_matches = NULL;
Pointer *extra_data = NULL;
bool *nullFlags = NULL;
int32 searchMode = GIN_SEARCH_MODE_DEFAULT;
int32 i;
/*
* Get the operator's strategy number and declared input data types
* within the index opfamily. (We don't need the latter, but we use
* get_op_opfamily_properties because it will throw error if it fails
* to find a matching pg_amop entry.)
*/
get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false,
&strategy_op, &lefttype, &righttype);
/*
* GIN always uses the "default" support functions, which are those
* with lefttype == righttype == the opclass' opcintype (see
* IndexSupportInitialize in relcache.c).
*/
extractProcOid = get_opfamily_proc(index->opfamily[indexcol],
index->opcintype[indexcol],
index->opcintype[indexcol],
GIN_EXTRACTQUERY_PROC);
if (!OidIsValid(extractProcOid))
{
/* should not happen; throw same error as index_getprocinfo */
elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
GIN_EXTRACTQUERY_PROC, indexcol + 1,
get_rel_name(index->indexoid));
}
OidFunctionCall7(extractProcOid,
query,
PointerGetDatum(&nentries),
UInt16GetDatum(strategy_op),
PointerGetDatum(&partial_matches),
PointerGetDatum(&extra_data),
PointerGetDatum(&nullFlags),
PointerGetDatum(&searchMode));
if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT)
{
/* No match is possible */
return false;
}
for (i = 0; i < nentries; i++)
{
/*
* For partial match we haven't any information to estimate number of
* matched entries in index, so, we just estimate it as 100
*/
if (partial_matches && partial_matches[i])
counts->partialEntries += 100;
else
counts->exactEntries++;
counts->searchEntries++;
}
if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY)
{
/* Treat "include empty" like an exact-match item */
counts->exactEntries++;
counts->searchEntries++;
}
else if (searchMode != GIN_SEARCH_MODE_DEFAULT)
{
/* It's GIN_SEARCH_MODE_ALL */
counts->haveFullScan = true;
}
return true;
}
/*
* Estimate the number of index terms that need to be searched for while
* testing the given GIN index clause, and increment the counts in *counts
* appropriately. If the query is unsatisfiable, return false.
*/
static bool
gincost_opexpr(IndexOptInfo *index, OpExpr *clause, GinQualCounts *counts)
{
Node *leftop = get_leftop((Expr *) clause);
Node *rightop = get_rightop((Expr *) clause);
Oid clause_op = clause->opno;
int indexcol;
Node *operand;
/* Locate the operand being compared to the index column */
if ((indexcol = find_index_column(leftop, index)) >= 0)
{
operand = rightop;
}
else if ((indexcol = find_index_column(rightop, index)) >= 0)
{
operand = leftop;
clause_op = get_commutator(clause_op);
}
else
{
elog(ERROR, "could not match index to operand");
operand = NULL; /* keep compiler quiet */
}
if (IsA(operand, RelabelType))
operand = (Node *) ((RelabelType *) operand)->arg;
/*
* It's impossible to call extractQuery method for unknown operand. So
* unless operand is a Const we can't do much; just assume there will
* be one ordinary search entry from the operand at runtime.
*/
if (!IsA(operand, Const))
{
counts->exactEntries++;
counts->searchEntries++;
return true;
}
/* If Const is null, there can be no matches */
if (((Const *) operand)->constisnull)
return false;
/* Otherwise, apply extractQuery and get the actual term counts */
return gincost_pattern(index, indexcol, clause_op,
((Const *) operand)->constvalue,
counts);
}
/*
* Estimate the number of index terms that need to be searched for while
* testing the given GIN index clause, and increment the counts in *counts
* appropriately. If the query is unsatisfiable, return false.
*
* A ScalarArrayOpExpr will give rise to N separate indexscans at runtime,
* each of which involves one value from the RHS array, plus all the
* non-array quals (if any). To model this, we average the counts across
* the RHS elements, and add the averages to the counts in *counts (which
* correspond to per-indexscan costs). We also multiply counts->arrayScans
* by N, causing gincostestimate to scale up its estimates accordingly.
*/
static bool
gincost_scalararrayopexpr(IndexOptInfo *index, ScalarArrayOpExpr *clause,
double numIndexEntries,
GinQualCounts *counts)
{
Node *leftop = (Node *) linitial(clause->args);
Node *rightop = (Node *) lsecond(clause->args);
Oid clause_op = clause->opno;
int indexcol;
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
char elmalign;
int numElems;
Datum *elemValues;
bool *elemNulls;
GinQualCounts arraycounts;
int numPossible = 0;
int i;
Assert(clause->useOr);
/* index column must be on the left */
if ((indexcol = find_index_column(leftop, index)) < 0)
elog(ERROR, "could not match index to operand");
if (IsA(rightop, RelabelType))
rightop = (Node *) ((RelabelType *) rightop)->arg;
/*
* It's impossible to call extractQuery method for unknown operand. So
* unless operand is a Const we can't do much; just assume there will
* be one ordinary search entry from each array entry at runtime, and
* fall back on a probably-bad estimate of the number of array entries.
*/
if (!IsA(rightop, Const))
{
counts->exactEntries++;
counts->searchEntries++;
counts->arrayScans *= estimate_array_length(rightop);
return true;
}
/* If Const is null, there can be no matches */
if (((Const *) rightop)->constisnull)
return false;
/* Otherwise, extract the array elements and iterate over them */
arrayval = DatumGetArrayTypeP(((Const *) rightop)->constvalue);
get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
&elmlen, &elmbyval, &elmalign);
deconstruct_array(arrayval,
ARR_ELEMTYPE(arrayval),
elmlen, elmbyval, elmalign,
&elemValues, &elemNulls, &numElems);
memset(&arraycounts, 0, sizeof(arraycounts));
for (i = 0; i < numElems; i++)
{
GinQualCounts elemcounts;
/* NULL can't match anything, so ignore, as the executor will */
if (elemNulls[i])
continue;
/* Otherwise, apply extractQuery and get the actual term counts */
memset(&elemcounts, 0, sizeof(elemcounts));
if (gincost_pattern(index, indexcol, clause_op, elemValues[i],
&elemcounts))
{
/* We ignore array elements that are unsatisfiable patterns */
numPossible++;
if (elemcounts.haveFullScan)
{
/*
* Full index scan will be required. We treat this as if
* every key in the index had been listed in the query; is
* that reasonable?
*/
elemcounts.partialEntries = 0;
elemcounts.exactEntries = numIndexEntries;
elemcounts.searchEntries = numIndexEntries;
}
arraycounts.partialEntries += elemcounts.partialEntries;
arraycounts.exactEntries += elemcounts.exactEntries;
arraycounts.searchEntries += elemcounts.searchEntries;
}
}
if (numPossible == 0)
{
/* No satisfiable patterns in the array */
return false;
}
/*
* Now add the averages to the global counts. This will give us an
* estimate of the average number of terms searched for in each indexscan,
* including contributions from both array and non-array quals.
*/
counts->partialEntries += arraycounts.partialEntries / numPossible;
counts->exactEntries += arraycounts.exactEntries / numPossible;
counts->searchEntries += arraycounts.searchEntries / numPossible;
counts->arrayScans *= numPossible;
return true;
}
/*
* GIN has search behavior completely different from other index types
*/
......@@ -6613,17 +6898,15 @@ gincostestimate(PG_FUNCTION_ARGS)
numDataPages,
numPendingPages,
numEntries;
bool haveFullScan = false;
double partialEntriesInQuals = 0.0;
double searchEntriesInQuals = 0.0;
double exactEntriesInQuals = 0.0;
GinQualCounts counts;
bool matchPossible;
double entryPagesFetched,
dataPagesFetched,
dataPagesFetchedBySel;
double qual_op_cost,
qual_arg_cost,
spc_random_page_cost,
num_scans;
outer_scans;
QualCost index_qual_cost;
Relation indexRel;
GinStatsData ginStats;
......@@ -6709,199 +6992,113 @@ gincostestimate(PG_FUNCTION_ARGS)
/*
* Examine quals to estimate number of search entries & partial matches
*/
memset(&counts, 0, sizeof(counts));
counts.arrayScans = 1;
matchPossible = true;
foreach(l, indexQuals)
{
RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
Expr *clause;
Node *leftop,
*rightop,
*operand;
Oid extractProcOid;
Oid clause_op;
int strategy_op;
Oid lefttype,
righttype;
int32 nentries = 0;
bool *partial_matches = NULL;
Pointer *extra_data = NULL;
bool *nullFlags = NULL;
int32 searchMode = GIN_SEARCH_MODE_DEFAULT;
int indexcol;
Assert(IsA(rinfo, RestrictInfo));
clause = rinfo->clause;
Assert(IsA(clause, OpExpr));
leftop = get_leftop(clause);
rightop = get_rightop(clause);
clause_op = ((OpExpr *) clause)->opno;
if ((indexcol = find_index_column(leftop, index)) >= 0)
if (IsA(clause, OpExpr))
{
operand = rightop;
matchPossible = gincost_opexpr(index,
(OpExpr *) clause,
&counts);
if (!matchPossible)
break;
}
else if ((indexcol = find_index_column(rightop, index)) >= 0)
else if (IsA(clause, ScalarArrayOpExpr))
{
operand = leftop;
clause_op = get_commutator(clause_op);
matchPossible = gincost_scalararrayopexpr(index,
(ScalarArrayOpExpr *) clause,
numEntries,
&counts);
if (!matchPossible)
break;
}
else
{
elog(ERROR, "could not match index to operand");
operand = NULL; /* keep compiler quiet */
}
if (IsA(operand, RelabelType))
operand = (Node *) ((RelabelType *) operand)->arg;
/*
* It's impossible to call extractQuery method for unknown operand. So
* unless operand is a Const we can't do much; just assume there will
* be one ordinary search entry from the operand at runtime.
*/
if (!IsA(operand, Const))
{
searchEntriesInQuals++;
continue;
}
/* If Const is null, there can be no matches */
if (((Const *) operand)->constisnull)
{
*indexStartupCost = 0;
*indexTotalCost = 0;
*indexSelectivity = 0;
PG_RETURN_VOID();
/* shouldn't be anything else for a GIN index */
elog(ERROR, "unsupported GIN indexqual type: %d",
(int) nodeTag(clause));
}
/*
* Get the operator's strategy number and declared input data types
* within the index opfamily. (We don't need the latter, but we use
* get_op_opfamily_properties because it will throw error if it fails
* to find a matching pg_amop entry.)
*/
get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false,
&strategy_op, &lefttype, &righttype);
/*
* GIN always uses the "default" support functions, which are those
* with lefttype == righttype == the opclass' opcintype (see
* IndexSupportInitialize in relcache.c).
*/
extractProcOid = get_opfamily_proc(index->opfamily[indexcol],
index->opcintype[indexcol],
index->opcintype[indexcol],
GIN_EXTRACTQUERY_PROC);
if (!OidIsValid(extractProcOid))
{
/* should not happen; throw same error as index_getprocinfo */
elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
GIN_EXTRACTQUERY_PROC, indexcol + 1,
get_rel_name(index->indexoid));
}
OidFunctionCall7(extractProcOid,
((Const *) operand)->constvalue,
PointerGetDatum(&nentries),
UInt16GetDatum(strategy_op),
PointerGetDatum(&partial_matches),
PointerGetDatum(&extra_data),
PointerGetDatum(&nullFlags),
PointerGetDatum(&searchMode));
if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT)
/* Fall out if there were any provably-unsatisfiable quals */
if (!matchPossible)
{
/* No match is possible */
*indexStartupCost = 0;
*indexTotalCost = 0;
*indexSelectivity = 0;
PG_RETURN_VOID();
}
else
{
int32 i;
for (i = 0; i < nentries; i++)
{
/*
* For partial match we haven't any information to estimate
* number of matched entries in index, so, we just estimate it
* as 100
*/
if (partial_matches && partial_matches[i])
partialEntriesInQuals += 100;
else
exactEntriesInQuals++;
searchEntriesInQuals++;
}
}
if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY)
{
/* Treat "include empty" like an exact-match item */
exactEntriesInQuals++;
searchEntriesInQuals++;
}
else if (searchMode != GIN_SEARCH_MODE_DEFAULT)
{
/* It's GIN_SEARCH_MODE_ALL */
haveFullScan = true;
}
}
if (haveFullScan || indexQuals == NIL)
if (counts.haveFullScan || indexQuals == NIL)
{
/*
* Full index scan will be required. We treat this as if every key in
* the index had been listed in the query; is that reasonable?
*/
searchEntriesInQuals = numEntries;
counts.partialEntries = 0;
counts.exactEntries = numEntries;
counts.searchEntries = numEntries;
}
/* Will we have more than one iteration of a nestloop scan? */
if (outer_rel != NULL && outer_rel->rows > 1)
num_scans = outer_rel->rows;
outer_scans = outer_rel->rows;
else
num_scans = 1;
outer_scans = 1;
/*
* cost to begin scan, first of all, pay attention to pending list.
* Compute cost to begin scan, first of all, pay attention to pending list.
*/
entryPagesFetched = numPendingPages;
/*
* Estimate number of entry pages read. We need to do
* searchEntriesInQuals searches. Use a power function as it should be,
* counts.searchEntries searches. Use a power function as it should be,
* but tuples on leaf pages usually is much greater. Here we include all
* searches in entry tree, including search of first entry in partial
* match algorithm
*/
entryPagesFetched += ceil(searchEntriesInQuals * rint(pow(numEntryPages, 0.15)));
entryPagesFetched += ceil(counts.searchEntries * rint(pow(numEntryPages, 0.15)));
/*
* Add an estimate of entry pages read by partial match algorithm. It's a
* scan over leaf pages in entry tree. We haven't any useful stats here,
* so estimate it as proportion.
*/
entryPagesFetched += ceil(numEntryPages * partialEntriesInQuals / numEntries);
entryPagesFetched += ceil(numEntryPages * counts.partialEntries / numEntries);
/*
* Partial match algorithm reads all data pages before doing actual scan,
* so it's a startup cost. Again, we havn't any useful stats here, so,
* so it's a startup cost. Again, we haven't any useful stats here, so,
* estimate it as proportion
*/
dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries);
dataPagesFetched = ceil(numDataPages * counts.partialEntries / numEntries);
/* calculate cache effects */
if (num_scans > 1 || searchEntriesInQuals > 1)
/*
* Calculate cache effects if more than one scan due to nestloops or array
* quals. The result is pro-rated per nestloop scan, but the array qual
* factor shouldn't be pro-rated (compare genericcostestimate).
*/
if (outer_scans > 1 || counts.arrayScans > 1)
{
entryPagesFetched *= outer_scans * counts.arrayScans;
entryPagesFetched = index_pages_fetched(entryPagesFetched,
(BlockNumber) numEntryPages,
numEntryPages, root);
entryPagesFetched /= outer_scans;
dataPagesFetched *= outer_scans * counts.arrayScans;
dataPagesFetched = index_pages_fetched(dataPagesFetched,
(BlockNumber) numDataPages,
numDataPages, root);
dataPagesFetched /= outer_scans;
}
/*
......@@ -6910,8 +7107,12 @@ gincostestimate(PG_FUNCTION_ARGS)
*/
*indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost;
/* cost to scan data pages for each exact (non-partial) matched entry */
dataPagesFetched = ceil(numDataPages * exactEntriesInQuals / numEntries);
/*
* Now we compute the number of data pages fetched while the scan proceeds.
*/
/* data pages scanned for each exact (non-partial) matched entry */
dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries);
/*
* Estimate number of data pages read, using selectivity estimation and
......@@ -6932,10 +7133,17 @@ gincostestimate(PG_FUNCTION_ARGS)
dataPagesFetched = dataPagesFetchedBySel;
}
if (num_scans > 1)
/* Account for cache effects, the same as above */
if (outer_scans > 1 || counts.arrayScans > 1)
{
dataPagesFetched *= outer_scans * counts.arrayScans;
dataPagesFetched = index_pages_fetched(dataPagesFetched,
(BlockNumber) numDataPages,
numDataPages, root);
dataPagesFetched /= outer_scans;
}
/* And apply random_page_cost as the cost per page */
*indexTotalCost = *indexStartupCost +
dataPagesFetched * spc_random_page_cost;
......
......@@ -142,6 +142,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
RESET enable_seqscan;
DROP INDEX wowidx;
CREATE INDEX wowidx ON test_tsvector USING gin (a);
......@@ -188,6 +194,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
494
(1 row)
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
count
-------
158
(1 row)
RESET enable_seqscan;
INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');
SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10;
......
......@@ -60,6 +60,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
RESET enable_seqscan;
......@@ -76,6 +77,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt';
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)';
SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)';
SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*';
SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}');
RESET enable_seqscan;
INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment