Commit 31f38f28 authored by Tom Lane

Redesign the planner's handling of index-descent cost estimation.

Historically we've used a couple of very ad-hoc fudge factors to try to
get the right results when indexes of different sizes would satisfy a
query with the same number of index leaf tuples being visited.  In
commit 21a39de5 I tweaked one of these
fudge factors, with results that proved disastrous for larger indexes.
Commit bf01e34b fudged it some more,
but still with not a lot of principle behind it.

What seems like a better way to address these issues is to explicitly model
index-descent costs, since that's what's really at stake when considering
different indexes with similar leaf-page-level costs.  We tried that once
long ago, and found that charging random_page_cost per page descended
through was way too much, because upper btree levels tend to stay in cache
in real-world workloads.  However, there are still CPU costs to think about,
and the previous fudge factors can be seen as a crude attempt to account
for those costs.  So this patch replaces those fudge factors with explicit
charges for the number of tuple comparisons needed to descend the index
tree, plus a small charge per page touched in the descent.  The cost
multipliers are chosen so that the resulting charges are in the vicinity of
the historical (pre-9.2) fudge factors for indexes of up to about a million
tuples, while not ballooning unreasonably beyond that, as the old fudge
factor did (even more so in 9.2).

To make this work accurately for btree indexes, add some code that allows
extraction of the known root-page height from a btree.  There's no
equivalent number readily available for other index types, but we can use
the log of the number of index pages as an approximate substitute.

This seems like too much of a behavioral change to risk back-patching,
but it should improve matters going forward.  In 9.2 I'll just revert
the fudge-factor change.
parent e1b735ae
...@@ -411,6 +411,82 @@ _bt_gettrueroot(Relation rel) ...@@ -411,6 +411,82 @@ _bt_gettrueroot(Relation rel)
return rootbuf; return rootbuf;
} }
/*
 *	_bt_getrootheight() -- Report how tall the btree search tree is.
 *
 *		The result is the level number (counted from zero) of the current
 *		fast root, that is, how many tree levels any btree index search
 *		has to descend through before it can begin.
 *
 *		The planner calls this while estimating costs.  An estimate can
 *		tolerate slightly out-of-date numbers, so metapage data that is
 *		already cached is used as-is and is never refreshed here.
 */
int
_bt_getrootheight(Relation rel)
{
	BTMetaPageData *cached;

	/*
	 * Everything we need lives in the cached copy of the metapage.  Fill
	 * the cache first if nobody has done so yet.  The sanity checks applied
	 * while doing that must stay in step with those in _bt_getroot().
	 */
	if (rel->rd_amcache == NULL)
	{
		Buffer		buf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
		Page		page = BufferGetPage(buf);
		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		BTMetaPageData *ondisk = BTPageGetMeta(page);

		/* Refuse to continue unless this really is a btree metapage */
		if ((opaque->btpo_flags & BTP_META) == 0 ||
			ondisk->btm_magic != BTREE_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a btree",
							RelationGetRelationName(rel))));

		/* ... and that it was written by a compatible btree version */
		if (ondisk->btm_version != BTREE_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
							RelationGetRelationName(rel),
							ondisk->btm_version, BTREE_VERSION)));

		/*
		 * An index without a root page must not get a cache entry, because
		 * _bt_getroot() does not expect one to exist in that state.  Release
		 * the metapage and report a height of zero instead.  (XXX perhaps
		 * _bt_getroot() should be changed to allow this case.)
		 */
		if (ondisk->btm_root == P_NONE)
		{
			_bt_relbuf(rel, buf);
			return 0;
		}

		/*
		 * Save a private copy of the metapage data for later callers, then
		 * let go of the buffer.
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, ondisk, sizeof(BTMetaPageData));

		_bt_relbuf(rel, buf);
	}

	cached = (BTMetaPageData *) rel->rd_amcache;

	/* Nothing failing these checks should ever have been cached */
	Assert(cached->btm_magic == BTREE_MAGIC);
	Assert(cached->btm_version == BTREE_VERSION);
	Assert(cached->btm_fastroot != P_NONE);

	return cached->btm_fastlevel;
}
/* /*
* _bt_checkpage() -- Verify that a freshly-read page looks sane. * _bt_checkpage() -- Verify that a freshly-read page looks sane.
*/ */
......
...@@ -1772,7 +1772,9 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node) ...@@ -1772,7 +1772,9 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
/* Do NOT print rel field, else infinite recursion */ /* Do NOT print rel field, else infinite recursion */
WRITE_UINT_FIELD(pages); WRITE_UINT_FIELD(pages);
WRITE_FLOAT_FIELD(tuples, "%.0f"); WRITE_FLOAT_FIELD(tuples, "%.0f");
WRITE_INT_FIELD(tree_height);
WRITE_INT_FIELD(ncolumns); WRITE_INT_FIELD(ncolumns);
/* array fields aren't really worth the trouble to print */
WRITE_OID_FIELD(relam); WRITE_OID_FIELD(relam);
/* indexprs is redundant since we print indextlist */ /* indexprs is redundant since we print indextlist */
WRITE_NODE_FIELD(indpred); WRITE_NODE_FIELD(indpred);
...@@ -1781,6 +1783,7 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node) ...@@ -1781,6 +1783,7 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
WRITE_BOOL_FIELD(unique); WRITE_BOOL_FIELD(unique);
WRITE_BOOL_FIELD(immediate); WRITE_BOOL_FIELD(immediate);
WRITE_BOOL_FIELD(hypothetical); WRITE_BOOL_FIELD(hypothetical);
/* we don't bother with fields copied from the pg_am entry */
} }
static void static void
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "access/genam.h" #include "access/genam.h"
#include "access/heapam.h" #include "access/heapam.h"
#include "access/htup_details.h" #include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/sysattr.h" #include "access/sysattr.h"
#include "access/transam.h" #include "access/transam.h"
#include "access/xlog.h" #include "access/xlog.h"
...@@ -352,6 +353,17 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, ...@@ -352,6 +353,17 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->tuples = rel->tuples; info->tuples = rel->tuples;
} }
if (info->relam == BTREE_AM_OID)
{
/* For btrees, get tree height while we have the index open */
info->tree_height = _bt_getrootheight(indexRelation);
}
else
{
/* For other index types, just set it to "unknown" for now */
info->tree_height = -1;
}
index_close(indexRelation, NoLock); index_close(indexRelation, NoLock);
indexinfos = lcons(info, indexinfos); indexinfos = lcons(info, indexinfos);
......
...@@ -201,6 +201,7 @@ static Selectivity regex_selectivity(const char *patt, int pattlen, ...@@ -201,6 +201,7 @@ static Selectivity regex_selectivity(const char *patt, int pattlen,
static Datum string_to_datum(const char *str, Oid datatype); static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype); static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len); static Const *string_to_bytea_const(const char *str, size_t str_len);
static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
/* /*
...@@ -5916,76 +5917,55 @@ string_to_bytea_const(const char *str, size_t str_len) ...@@ -5916,76 +5917,55 @@ string_to_bytea_const(const char *str, size_t str_len)
*/ */
/* /*
* If the index is partial, add its predicate to the given qual list. * genericcostestimate is a general-purpose estimator that can be used for
* * most index types. In some cases we use genericcostestimate as the base
* ANDing the index predicate with the explicitly given indexquals produces * code and then incorporate additional index-type-specific knowledge in
* a more accurate idea of the index's selectivity. However, we need to be * the type-specific calling function. To avoid code duplication, we make
* careful not to insert redundant clauses, because clauselist_selectivity() * genericcostestimate return a number of intermediate values as well as
* is easily fooled into computing a too-low selectivity estimate. Our * its preliminary estimates of the output cost values. The GenericCosts
* approach is to add only the predicate clause(s) that cannot be proven to * struct includes all these values.
* be implied by the given indexquals. This successfully handles cases such *
* as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50". * Callers should initialize all fields of GenericCosts to zero. In addition,
* There are many other cases where we won't detect redundancy, leading to a * they can set numIndexTuples to some positive value if they have a better
* too-low selectivity estimate, which will bias the system in favor of using * than default way of estimating the number of leaf index tuples visited.
* partial indexes where possible. That is not necessarily bad though.
*
* Note that indexQuals contains RestrictInfo nodes while the indpred
* does not, so the output list will be mixed. This is OK for both
* predicate_implied_by() and clauselist_selectivity(), but might be
* problematic if the result were passed to other things.
*/ */
static List * typedef struct
add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
{ {
List *predExtraQuals = NIL; /* These are the values the cost estimator must return to the planner */
ListCell *lc; Cost indexStartupCost; /* index-related startup cost */
Cost indexTotalCost; /* total index-related scan cost */
if (index->indpred == NIL) Selectivity indexSelectivity; /* selectivity of index */
return indexQuals; double indexCorrelation; /* order correlation of index */
/* Intermediate values we obtain along the way */
double numIndexPages; /* number of leaf pages visited */
double numIndexTuples; /* number of leaf tuples visited */
double spc_random_page_cost; /* relevant random_page_cost value */
double num_sa_scans; /* # indexscans from ScalarArrayOps */
} GenericCosts;
foreach(lc, index->indpred)
{
Node *predQual = (Node *) lfirst(lc);
List *oneQual = list_make1(predQual);
if (!predicate_implied_by(oneQual, indexQuals))
predExtraQuals = list_concat(predExtraQuals, oneQual);
}
/* list_concat avoids modifying the passed-in indexQuals list */
return list_concat(predExtraQuals, indexQuals);
}
/*
* genericcostestimate is a general-purpose estimator for use when we
* don't have any better idea about how to estimate. Index-type-specific
* knowledge can be incorporated in the type-specific routines.
*
* One bit of index-type-specific knowledge we can relatively easily use
* in genericcostestimate is the estimate of the number of index tuples
* visited. If numIndexTuples is not 0 then it is used as the estimate,
* otherwise we compute a generic estimate.
*/
static void static void
genericcostestimate(PlannerInfo *root, genericcostestimate(PlannerInfo *root,
IndexPath *path, IndexPath *path,
double loop_count, double loop_count,
double numIndexTuples, GenericCosts *costs)
Cost *indexStartupCost,
Cost *indexTotalCost,
Selectivity *indexSelectivity,
double *indexCorrelation)
{ {
IndexOptInfo *index = path->indexinfo; IndexOptInfo *index = path->indexinfo;
List *indexQuals = path->indexquals; List *indexQuals = path->indexquals;
List *indexOrderBys = path->indexorderbys; List *indexOrderBys = path->indexorderbys;
Cost indexStartupCost;
Cost indexTotalCost;
Selectivity indexSelectivity;
double indexCorrelation;
double numIndexPages; double numIndexPages;
double numIndexTuples;
double spc_random_page_cost;
double num_sa_scans; double num_sa_scans;
double num_outer_scans; double num_outer_scans;
double num_scans; double num_scans;
QualCost index_qual_cost; QualCost index_qual_cost;
double qual_op_cost; double qual_op_cost;
double qual_arg_cost; double qual_arg_cost;
double spc_random_page_cost;
List *selectivityQuals; List *selectivityQuals;
ListCell *l; ListCell *l;
...@@ -6016,7 +5996,7 @@ genericcostestimate(PlannerInfo *root, ...@@ -6016,7 +5996,7 @@ genericcostestimate(PlannerInfo *root,
} }
/* Estimate the fraction of main-table tuples that will be visited */ /* Estimate the fraction of main-table tuples that will be visited */
*indexSelectivity = clauselist_selectivity(root, selectivityQuals, indexSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid, index->rel->relid,
JOIN_INNER, JOIN_INNER,
NULL); NULL);
...@@ -6026,9 +6006,10 @@ genericcostestimate(PlannerInfo *root, ...@@ -6026,9 +6006,10 @@ genericcostestimate(PlannerInfo *root,
* tuples that will be visited. We do it in this rather peculiar-looking * tuples that will be visited. We do it in this rather peculiar-looking
* way in order to get the right answer for partial indexes. * way in order to get the right answer for partial indexes.
*/ */
numIndexTuples = costs->numIndexTuples;
if (numIndexTuples <= 0.0) if (numIndexTuples <= 0.0)
{ {
numIndexTuples = *indexSelectivity * index->rel->tuples; numIndexTuples = indexSelectivity * index->rel->tuples;
/* /*
* The above calculation counts all the tuples visited across all * The above calculation counts all the tuples visited across all
...@@ -6055,9 +6036,12 @@ genericcostestimate(PlannerInfo *root, ...@@ -6055,9 +6036,12 @@ genericcostestimate(PlannerInfo *root,
* *
* We use the simplistic method of taking a pro-rata fraction of the total * We use the simplistic method of taking a pro-rata fraction of the total
* number of index pages. In effect, this counts only leaf pages and not * number of index pages. In effect, this counts only leaf pages and not
* any overhead such as index metapage or upper tree levels. In practice * any overhead such as index metapage or upper tree levels.
* this seems a better approximation than charging for access to the upper *
* levels, perhaps because those tend to stay in cache under load. * In practice access to upper index levels is often nearly free because
* those tend to stay in cache under load; moreover, the cost involved is
* highly dependent on index type. We therefore ignore such costs here
* and leave it to the caller to add a suitable charge if needed.
*/ */
if (index->pages > 1 && index->tuples > 1) if (index->pages > 1 && index->tuples > 1)
numIndexPages = ceil(numIndexTuples * index->pages / index->tuples); numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
...@@ -6107,7 +6091,7 @@ genericcostestimate(PlannerInfo *root, ...@@ -6107,7 +6091,7 @@ genericcostestimate(PlannerInfo *root,
* share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr, * share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr,
* since that's internal to the indexscan.) * since that's internal to the indexscan.)
*/ */
*indexTotalCost = (pages_fetched * spc_random_page_cost) indexTotalCost = (pages_fetched * spc_random_page_cost)
/ num_outer_scans; / num_outer_scans;
} }
else else
...@@ -6116,29 +6100,9 @@ genericcostestimate(PlannerInfo *root, ...@@ -6116,29 +6100,9 @@ genericcostestimate(PlannerInfo *root,
* For a single index scan, we just charge spc_random_page_cost per * For a single index scan, we just charge spc_random_page_cost per
* page touched. * page touched.
*/ */
*indexTotalCost = numIndexPages * spc_random_page_cost; indexTotalCost = numIndexPages * spc_random_page_cost;
} }
/*
* A difficulty with the leaf-pages-only cost approach is that for small
* selectivities (eg, single index tuple fetched) all indexes will look
* equally attractive because we will estimate exactly 1 leaf page to be
* fetched. All else being equal, we should prefer physically smaller
* indexes over larger ones. (An index might be smaller because it is
* partial or because it contains fewer columns; presumably the other
* columns in the larger index aren't useful to the query, or the larger
* index would have better selectivity.)
*
* We can deal with this by adding a very small "fudge factor" that
* depends on the index size, so that indexes of different sizes won't
* look exactly equally attractive. To ensure the fudge factor stays
* small even for very large indexes, use a log function. (We previously
* used a factor of one spc_random_page_cost per 10000 index pages, which
* grew too large for large indexes. This expression has about the same
* growth rate for small indexes, but tails off quickly.)
*/
*indexTotalCost += log(1.0 + index->pages / 10000.0) * spc_random_page_cost;
/* /*
* CPU cost: any complex expressions in the indexquals will need to be * CPU cost: any complex expressions in the indexquals will need to be
* evaluated once at the start of the scan to reduce them to runtime keys * evaluated once at the start of the scan to reduce them to runtime keys
...@@ -6149,10 +6113,9 @@ genericcostestimate(PlannerInfo *root, ...@@ -6149,10 +6113,9 @@ genericcostestimate(PlannerInfo *root,
* for ScalarArrayOpExpr cases. Similarly add in costs for any index * for ScalarArrayOpExpr cases. Similarly add in costs for any index
* ORDER BY expressions. * ORDER BY expressions.
* *
* Note: this neglects the possible costs of rechecking lossy operators * Note: this neglects the possible costs of rechecking lossy operators.
* and OR-clause expressions. Detecting that that might be needed seems * Detecting that that might be needed seems more expensive than it's
* more expensive than it's worth, though, considering all the other * worth, though, considering all the other inaccuracies here ...
* inaccuracies here ...
*/ */
cost_qual_eval(&index_qual_cost, indexQuals, root); cost_qual_eval(&index_qual_cost, indexQuals, root);
qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple; qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
...@@ -6164,29 +6127,66 @@ genericcostestimate(PlannerInfo *root, ...@@ -6164,29 +6127,66 @@ genericcostestimate(PlannerInfo *root,
if (qual_arg_cost < 0) /* just in case... */ if (qual_arg_cost < 0) /* just in case... */
qual_arg_cost = 0; qual_arg_cost = 0;
*indexStartupCost = qual_arg_cost; indexStartupCost = qual_arg_cost;
*indexTotalCost += qual_arg_cost; indexTotalCost += qual_arg_cost;
*indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost); indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
/* /*
* We also add a CPU-cost component to represent the general costs of * Generic assumption about index correlation: there isn't any.
* starting an indexscan, such as analysis of btree index keys and initial
* tree descent. This is estimated at 100x cpu_operator_cost, which is a
* bit arbitrary but seems the right order of magnitude. (As noted above,
* we don't charge any I/O for touching upper tree levels, but charging
* nothing at all has been found too optimistic.)
*
* Although this is startup cost with respect to any one scan, we add it
* to the "total" cost component because it's only very interesting in the
* many-ScalarArrayOpExpr-scan case, and there it will be paid over the
* life of the scan node.
*/ */
*indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost; indexCorrelation = 0.0;
/* /*
* Generic assumption about index correlation: there isn't any. * Return everything to caller.
*/ */
*indexCorrelation = 0.0; costs->indexStartupCost = indexStartupCost;
costs->indexTotalCost = indexTotalCost;
costs->indexSelectivity = indexSelectivity;
costs->indexCorrelation = indexCorrelation;
costs->numIndexPages = numIndexPages;
costs->numIndexTuples = numIndexTuples;
costs->spc_random_page_cost = spc_random_page_cost;
costs->num_sa_scans = num_sa_scans;
}
/*
 * If the index is partial, add its predicate to the given qual list.
 *
 * ANDing the index predicate with the explicitly given indexquals produces
 * a more accurate idea of the index's selectivity. However, we need to be
 * careful not to insert redundant clauses, because clauselist_selectivity()
 * is easily fooled into computing a too-low selectivity estimate. Our
 * approach is to add only the predicate clause(s) that cannot be proven to
 * be implied by the given indexquals. This successfully handles cases such
 * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
 * There are many other cases where we won't detect redundancy, leading to a
 * too-low selectivity estimate, which will bias the system in favor of using
 * partial indexes where possible. That is not necessarily bad though.
 *
 * Note that indexQuals contains RestrictInfo nodes while the indpred
 * does not, so the output list will be mixed. This is OK for both
 * predicate_implied_by() and clauselist_selectivity(), but might be
 * problematic if the result were passed to other things.
 *
 * Returns indexQuals itself when the index has no predicate; otherwise
 * returns the result of concatenating the not-provably-implied predicate
 * clauses with indexQuals.
 */
static List *
add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
{
List *predExtraQuals = NIL;
ListCell *lc;
/* No index predicate: hand back the caller's list unchanged */
if (index->indpred == NIL)
return indexQuals;
/* Test each predicate clause on its own against the indexquals */
foreach(lc, index->indpred)
{
Node *predQual = (Node *) lfirst(lc);
List *oneQual = list_make1(predQual);
/* Keep only the clauses that the indexquals are not proven to imply */
if (!predicate_implied_by(oneQual, indexQuals))
predExtraQuals = list_concat(predExtraQuals, oneQual);
}
/* list_concat avoids modifying the passed-in indexQuals list */
return list_concat(predExtraQuals, indexQuals);
} }
...@@ -6201,10 +6201,12 @@ btcostestimate(PG_FUNCTION_ARGS) ...@@ -6201,10 +6201,12 @@ btcostestimate(PG_FUNCTION_ARGS)
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6); double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo; IndexOptInfo *index = path->indexinfo;
GenericCosts costs;
Oid relid; Oid relid;
AttrNumber colnum; AttrNumber colnum;
VariableStatData vardata; VariableStatData vardata;
double numIndexTuples; double numIndexTuples;
Cost descentCost;
List *indexBoundQuals; List *indexBoundQuals;
int indexcol; int indexcol;
bool eqQualHere; bool eqQualHere;
...@@ -6379,10 +6381,45 @@ btcostestimate(PG_FUNCTION_ARGS) ...@@ -6379,10 +6381,45 @@ btcostestimate(PG_FUNCTION_ARGS)
numIndexTuples = rint(numIndexTuples / num_sa_scans); numIndexTuples = rint(numIndexTuples / num_sa_scans);
} }
genericcostestimate(root, path, loop_count, /*
numIndexTuples, * Now do generic index cost estimation.
indexStartupCost, indexTotalCost, */
indexSelectivity, indexCorrelation); MemSet(&costs, 0, sizeof(costs));
costs.numIndexTuples = numIndexTuples;
genericcostestimate(root, path, loop_count, &costs);
/*
* Add a CPU-cost component to represent the costs of initial btree
* descent. We don't charge any I/O cost for touching upper btree levels,
* since they tend to stay in cache, but we still have to do about log2(N)
* comparisons to descend a btree of N leaf tuples. We charge one
* cpu_operator_cost per comparison.
*
* If there are ScalarArrayOpExprs, charge this once per SA scan. The
* ones after the first one are not startup cost so far as the overall
* plan is concerned, so add them only to "total" cost.
*/
if (index->tuples > 1) /* avoid computing log(0) */
{
descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
}
/*
* Even though we're not charging I/O cost for touching upper btree pages,
* it's still reasonable to charge some CPU cost per page descended
* through. Moreover, if we had no such charge at all, bloated indexes
* would appear to have the same search cost as unbloated ones, at least
* in cases where only a single leaf page is expected to be visited. This
* cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
* touched. The number of such pages is btree tree height plus one (ie,
* we charge for the leaf page too). As above, charge once per SA scan.
*/
descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
/* /*
* If we can get an estimate of the first column's ordering correlation C * If we can get an estimate of the first column's ordering correlation C
...@@ -6478,9 +6515,9 @@ btcostestimate(PG_FUNCTION_ARGS) ...@@ -6478,9 +6515,9 @@ btcostestimate(PG_FUNCTION_ARGS)
varCorrelation = -varCorrelation; varCorrelation = -varCorrelation;
if (index->ncolumns > 1) if (index->ncolumns > 1)
*indexCorrelation = varCorrelation * 0.75; costs.indexCorrelation = varCorrelation * 0.75;
else else
*indexCorrelation = varCorrelation; costs.indexCorrelation = varCorrelation;
free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers); free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
} }
...@@ -6488,6 +6525,11 @@ btcostestimate(PG_FUNCTION_ARGS) ...@@ -6488,6 +6525,11 @@ btcostestimate(PG_FUNCTION_ARGS)
ReleaseVariableStats(vardata); ReleaseVariableStats(vardata);
*indexStartupCost = costs.indexStartupCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -6501,10 +6543,41 @@ hashcostestimate(PG_FUNCTION_ARGS) ...@@ -6501,10 +6543,41 @@ hashcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6); double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
GenericCosts costs;
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, 0.0, genericcostestimate(root, path, loop_count, &costs);
indexStartupCost, indexTotalCost,
indexSelectivity, indexCorrelation); /*
* A hash index has no descent costs as such, since the index AM can go
* directly to the target bucket after computing the hash value. There
* are a couple of other hash-specific costs that we could conceivably add
* here, though:
*
* Ideally we'd charge spc_random_page_cost for each page in the target
* bucket, not just the numIndexPages pages that genericcostestimate
* thought we'd visit. However in most cases we don't know which bucket
* that will be. There's no point in considering the average bucket size
* because the hash AM makes sure that's always one page.
*
* Likewise, we could consider charging some CPU for each index tuple in
* the bucket, if we knew how many there were. But the per-tuple cost is
* just a hash value comparison, not a general datatype-dependent
* comparison, so any such charge ought to be quite a bit less than
* cpu_operator_cost; which makes it probably not worth worrying about.
*
* A bigger issue is that chance hash-value collisions will result in
* wasted probes into the heap. We don't currently attempt to model this
* cost on the grounds that it's rare, but maybe it's not rare enough.
* (Any fix for this ought to consider the generic lossy-operator problem,
* though; it's not entirely hash-specific.)
*/
*indexStartupCost = costs.indexStartupCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -6519,10 +6592,54 @@ gistcostestimate(PG_FUNCTION_ARGS) ...@@ -6519,10 +6592,54 @@ gistcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6); double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
GenericCosts costs;
Cost descentCost;
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/*
* We model index descent costs similarly to those for btree, but to do
* that we first need an idea of the tree height. We somewhat arbitrarily
* assume that the fanout is 100, meaning the tree height is at most
* log100(index->pages).
*
* Although this computation isn't really expensive enough to require
* caching, we might as well use index->tree_height to cache it.
*/
if (index->tree_height < 0) /* unknown? */
{
if (index->pages > 1) /* avoid computing log(0) */
index->tree_height = (int) (log(index->pages) / log(100.0));
else
index->tree_height = 0;
}
/*
* Add a CPU-cost component to represent the costs of initial descent.
* We just use log(N) here not log2(N) since the branching factor isn't
* necessarily two anyway. As for btree, charge once per SA scan.
*/
if (index->tuples > 1) /* avoid computing log(0) */
{
descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
}
/*
* Likewise add a per-page charge, calculated the same as for btrees.
*/
descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
genericcostestimate(root, path, loop_count, 0.0, *indexStartupCost = costs.indexStartupCost;
indexStartupCost, indexTotalCost, *indexTotalCost = costs.indexTotalCost;
indexSelectivity, indexCorrelation); *indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -6537,10 +6654,54 @@ spgcostestimate(PG_FUNCTION_ARGS) ...@@ -6537,10 +6654,54 @@ spgcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6); double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
GenericCosts costs;
Cost descentCost;
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/*
* We model index descent costs similarly to those for btree, but to do
* that we first need an idea of the tree height. We somewhat arbitrarily
* assume that the fanout is 100, meaning the tree height is at most
* log100(index->pages).
*
* Although this computation isn't really expensive enough to require
* caching, we might as well use index->tree_height to cache it.
*/
if (index->tree_height < 0) /* unknown? */
{
if (index->pages > 1) /* avoid computing log(0) */
index->tree_height = (int) (log(index->pages) / log(100.0));
else
index->tree_height = 0;
}
/*
* Add a CPU-cost component to represent the costs of initial descent.
* We just use log(N) here not log2(N) since the branching factor isn't
* necessarily two anyway. As for btree, charge once per SA scan.
*/
if (index->tuples > 1) /* avoid computing log(0) */
{
descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
}
/*
* Likewise add a per-page charge, calculated the same as for btrees.
*/
descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
costs.indexStartupCost += descentCost;
costs.indexTotalCost += costs.num_sa_scans * descentCost;
genericcostestimate(root, path, loop_count, 0.0, *indexStartupCost = costs.indexStartupCost;
indexStartupCost, indexTotalCost, *indexTotalCost = costs.indexTotalCost;
indexSelectivity, indexCorrelation); *indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
......
...@@ -626,6 +626,7 @@ extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, ...@@ -626,6 +626,7 @@ extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel); extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf); extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
......
...@@ -487,9 +487,10 @@ typedef struct IndexOptInfo ...@@ -487,9 +487,10 @@ typedef struct IndexOptInfo
Oid reltablespace; /* tablespace of index (not table) */ Oid reltablespace; /* tablespace of index (not table) */
RelOptInfo *rel; /* back-link to index's table */ RelOptInfo *rel; /* back-link to index's table */
/* statistics from pg_class */ /* index-size statistics (from pg_class and elsewhere) */
BlockNumber pages; /* number of disk pages in index */ BlockNumber pages; /* number of disk pages in index */
double tuples; /* number of index tuples in index */ double tuples; /* number of index tuples in index */
int tree_height; /* index tree height, or -1 if unknown */
/* index descriptor information */ /* index descriptor information */
int ncolumns; /* number of columns in index */ int ncolumns; /* number of columns in index */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment