Commit 31f38f28 authored by Tom Lane's avatar Tom Lane

Redesign the planner's handling of index-descent cost estimation.

Historically we've used a couple of very ad-hoc fudge factors to try to
get the right results when indexes of different sizes would satisfy a
query with the same number of index leaf tuples being visited.  In
commit 21a39de5 I tweaked one of these
fudge factors, with results that proved disastrous for larger indexes.
Commit bf01e34b fudged it some more,
but still with not a lot of principle behind it.

What seems like a better way to address these issues is to explicitly model
index-descent costs, since that's what's really at stake when considering
different indexes with similar leaf-page-level costs.  We tried that once
long ago, and found that charging random_page_cost per page descended
through was way too much, because upper btree levels tend to stay in cache
in real-world workloads.  However, there are still CPU costs to think about,
and the previous fudge factors can be seen as a crude attempt to account
for those costs.  So this patch replaces those fudge factors with explicit
charges for the number of tuple comparisons needed to descend the index
tree, plus a small charge per page touched in the descent.  The cost
multipliers are chosen so that the resulting charges are in the vicinity of
the historical (pre-9.2) fudge factors for indexes of up to about a million
tuples, while not ballooning unreasonably beyond that, as the old fudge
factor did (even more so in 9.2).

To make this work accurately for btree indexes, add some code that allows
extraction of the known root-page height from a btree.  There's no
equivalent number readily available for other index types, but we can use
the log of the number of index pages as an approximate substitute.

This seems like too much of a behavioral change to risk back-patching,
but it should improve matters going forward.  In 9.2 I'll just revert
the fudge-factor change.
parent e1b735ae
......@@ -411,6 +411,82 @@ _bt_gettrueroot(Relation rel)
return rootbuf;
}
/*
 *	_bt_getrootheight() -- Report the height of the btree search tree.
 *
 *		The result is the zero-based level number of the index's current
 *		fast root, which is the number of tree levels that have to be
 *		descended through to start any search of this btree.
 *
 *		The planner calls this while gathering index statistics for cost
 *		estimation.  An estimate can tolerate slightly out-of-date input,
 *		so metapage data that is already cached is used as-is and is never
 *		refreshed here.
 *
 *		Returns 0 (without creating a cache entry) when the index has no
 *		root page yet.  Raises ERRCODE_INDEX_CORRUPTED if the metapage
 *		does not look like a btree metapage of the expected version.
 */
int
_bt_getrootheight(Relation rel)
{
	BTMetaPageData *cached;

	/*
	 * Everything needed lives in the cached copy of the metapage.  When no
	 * copy exists yet, read the metapage and build one.  The validity checks
	 * below have to stay in step with those in _bt_getroot().
	 */
	if (rel->rd_amcache == NULL)
	{
		Buffer		buf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
		Page		page = BufferGetPage(buf);
		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		BTMetaPageData *ondisk = BTPageGetMeta(page);
		BTMetaPageData *copy;

		/* the page must carry the meta flag and the btree magic number */
		if ((opaque->btpo_flags & BTP_META) == 0 ||
			ondisk->btm_magic != BTREE_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a btree",
							RelationGetRelationName(rel))));

		/* ... and must have been written by a compatible code version */
		if (ondisk->btm_version != BTREE_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
							RelationGetRelationName(rel),
							ondisk->btm_version, BTREE_VERSION)));

		/*
		 * An index without a root page must not get a cache entry, because
		 * _bt_getroot() does not expect one to exist in that state.  Release
		 * the buffer and report a height of zero.  (XXX perhaps
		 * _bt_getroot() should be changed to allow this case.)
		 */
		if (ondisk->btm_root == P_NONE)
		{
			_bt_relbuf(rel, buf);
			return 0;
		}

		/*
		 * Save a private copy of the metapage contents in the index's own
		 * memory context so later calls can skip the page read.  The copy is
		 * made while the buffer is still held.
		 */
		copy = MemoryContextAlloc(rel->rd_indexcxt, sizeof(BTMetaPageData));
		memcpy(copy, ondisk, sizeof(BTMetaPageData));
		rel->rd_amcache = copy;

		_bt_relbuf(rel, buf);
	}

	cached = (BTMetaPageData *) rel->rd_amcache;

	/* A cache entry is only ever stored after these conditions were verified */
	Assert(cached->btm_magic == BTREE_MAGIC);
	Assert(cached->btm_version == BTREE_VERSION);
	Assert(cached->btm_fastroot != P_NONE);

	return cached->btm_fastlevel;
}
/*
* _bt_checkpage() -- Verify that a freshly-read page looks sane.
*/
......
......@@ -1772,7 +1772,9 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
/* Do NOT print rel field, else infinite recursion */
WRITE_UINT_FIELD(pages);
WRITE_FLOAT_FIELD(tuples, "%.0f");
WRITE_INT_FIELD(tree_height);
WRITE_INT_FIELD(ncolumns);
/* array fields aren't really worth the trouble to print */
WRITE_OID_FIELD(relam);
/* indexprs is redundant since we print indextlist */
WRITE_NODE_FIELD(indpred);
......@@ -1781,6 +1783,7 @@ _outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
WRITE_BOOL_FIELD(unique);
WRITE_BOOL_FIELD(immediate);
WRITE_BOOL_FIELD(hypothetical);
/* we don't bother with fields copied from the pg_am entry */
}
static void
......
......@@ -20,6 +20,7 @@
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/sysattr.h"
#include "access/transam.h"
#include "access/xlog.h"
......@@ -352,6 +353,17 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->tuples = rel->tuples;
}
if (info->relam == BTREE_AM_OID)
{
/* For btrees, get tree height while we have the index open */
info->tree_height = _bt_getrootheight(indexRelation);
}
else
{
/* For other index types, just set it to "unknown" for now */
info->tree_height = -1;
}
index_close(indexRelation, NoLock);
indexinfos = lcons(info, indexinfos);
......
This diff is collapsed.
......@@ -626,6 +626,7 @@ extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
......
......@@ -487,9 +487,10 @@ typedef struct IndexOptInfo
Oid reltablespace; /* tablespace of index (not table) */
RelOptInfo *rel; /* back-link to index's table */
/* statistics from pg_class */
/* index-size statistics (from pg_class and elsewhere) */
BlockNumber pages; /* number of disk pages in index */
double tuples; /* number of index tuples in index */
int tree_height; /* index tree height, or -1 if unknown */
/* index descriptor information */
int ncolumns; /* number of columns in index */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment