Commit 48c7d9f6 authored by Tom Lane

Improve GIN indexscan cost estimation.

The better estimate requires more statistics than we previously stored:
in particular, counts of "entry" versus "data" pages within the index,
as well as knowledge of the number of distinct key values.  We collect
this information during initial index build and update it during VACUUM,
storing the info in new fields on the index metapage.  No initdb is
required because these fields will read as zeroes in a pre-existing
index, and the new gincostestimate code is coded to behave (reasonably)
sanely if they are zeroes.

Teodor Sigaev, reviewed by Jan Urbanski, Tom Lane, and Itagaki Takahiro.
parent cd0e8253
...@@ -268,10 +268,13 @@ findParents(GinBtree btree, GinBtreeStack *stack, ...@@ -268,10 +268,13 @@ findParents(GinBtree btree, GinBtreeStack *stack,
/* /*
* Insert value (stored in GinBtree) to tree described by stack * Insert value (stored in GinBtree) to tree described by stack
* *
* During an index build, buildStats is non-null and the counters
* it contains should be incremented as needed.
*
* NB: the passed-in stack is freed, as though by freeGinBtreeStack. * NB: the passed-in stack is freed, as though by freeGinBtreeStack.
*/ */
void void
ginInsertValue(GinBtree btree, GinBtreeStack *stack) ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats)
{ {
GinBtreeStack *parent = stack; GinBtreeStack *parent = stack;
BlockNumber rootBlkno = InvalidBuffer; BlockNumber rootBlkno = InvalidBuffer;
...@@ -330,6 +333,15 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) ...@@ -330,6 +333,15 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno; ((ginxlogSplit *) (rdata->data))->rootBlkno = rootBlkno;
/* During index build, count the newly-split page */
if (buildStats)
{
if (btree->isData)
buildStats->nDataPages++;
else
buildStats->nEntryPages++;
}
parent = stack->parent; parent = stack->parent;
if (parent == NULL) if (parent == NULL)
...@@ -381,6 +393,15 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack) ...@@ -381,6 +393,15 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack)
freeGinBtreeStack(stack); freeGinBtreeStack(stack);
/* During index build, count the newly-added root page */
if (buildStats)
{
if (btree->isData)
buildStats->nDataPages++;
else
buildStats->nEntryPages++;
}
return; return;
} }
else else
......
...@@ -592,9 +592,11 @@ void ...@@ -592,9 +592,11 @@ void
prepareDataScan(GinBtree btree, Relation index) prepareDataScan(GinBtree btree, Relation index)
{ {
memset(btree, 0, sizeof(GinBtreeData)); memset(btree, 0, sizeof(GinBtreeData));
btree->index = index; btree->index = index;
btree->isMoveRight = dataIsMoveRight;
btree->findChildPage = dataLocateItem; btree->findChildPage = dataLocateItem;
btree->isMoveRight = dataIsMoveRight;
btree->findItem = dataLocateLeafItem; btree->findItem = dataLocateLeafItem;
btree->findChildPtr = dataFindChildPtr; btree->findChildPtr = dataFindChildPtr;
btree->getLeftMostPage = dataGetLeftMostPage; btree->getLeftMostPage = dataGetLeftMostPage;
...@@ -603,6 +605,7 @@ prepareDataScan(GinBtree btree, Relation index) ...@@ -603,6 +605,7 @@ prepareDataScan(GinBtree btree, Relation index)
btree->splitPage = dataSplitPage; btree->splitPage = dataSplitPage;
btree->fillRoot = dataFillRoot; btree->fillRoot = dataFillRoot;
btree->isData = TRUE;
btree->searchMode = FALSE; btree->searchMode = FALSE;
btree->isDelete = FALSE; btree->isDelete = FALSE;
btree->fullScan = FALSE; btree->fullScan = FALSE;
...@@ -628,7 +631,9 @@ prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode) ...@@ -628,7 +631,9 @@ prepareScanPostingTree(Relation index, BlockNumber rootBlkno, bool searchMode)
* Inserts array of item pointers, may execute several tree scan (very rare) * Inserts array of item pointers, may execute several tree scan (very rare)
*/ */
void void
insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) ginInsertItemPointer(GinPostingTreeScan *gdi,
ItemPointerData *items, uint32 nitem,
GinStatsData *buildStats)
{ {
BlockNumber rootBlkno = gdi->stack->blkno; BlockNumber rootBlkno = gdi->stack->blkno;
...@@ -653,7 +658,7 @@ insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) ...@@ -653,7 +658,7 @@ insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem)
freeGinBtreeStack(gdi->stack); freeGinBtreeStack(gdi->stack);
} }
else else
ginInsertValue(&(gdi->btree), gdi->stack); ginInsertValue(&(gdi->btree), gdi->stack, buildStats);
gdi->stack = NULL; gdi->stack = NULL;
} }
......
...@@ -659,8 +659,11 @@ prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum valu ...@@ -659,8 +659,11 @@ prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum valu
{ {
memset(btree, 0, sizeof(GinBtreeData)); memset(btree, 0, sizeof(GinBtreeData));
btree->isMoveRight = entryIsMoveRight; btree->index = index;
btree->ginstate = ginstate;
btree->findChildPage = entryLocateEntry; btree->findChildPage = entryLocateEntry;
btree->isMoveRight = entryIsMoveRight;
btree->findItem = entryLocateLeafEntry; btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr; btree->findChildPtr = entryFindChildPtr;
btree->getLeftMostPage = entryGetLeftMostPage; btree->getLeftMostPage = entryGetLeftMostPage;
...@@ -669,13 +672,12 @@ prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum valu ...@@ -669,13 +672,12 @@ prepareEntryScan(GinBtree btree, Relation index, OffsetNumber attnum, Datum valu
btree->splitPage = entrySplitPage; btree->splitPage = entrySplitPage;
btree->fillRoot = entryFillRoot; btree->fillRoot = entryFillRoot;
btree->index = index; btree->isData = FALSE;
btree->ginstate = ginstate;
btree->entryAttnum = attnum;
btree->entryValue = value;
btree->isDelete = FALSE;
btree->searchMode = FALSE; btree->searchMode = FALSE;
btree->fullScan = FALSE; btree->fullScan = FALSE;
btree->isBuild = FALSE; btree->isBuild = FALSE;
btree->entryAttnum = attnum;
btree->entryValue = value;
btree->isDelete = FALSE;
} }
...@@ -789,7 +789,7 @@ ginInsertCleanup(Relation index, GinState *ginstate, ...@@ -789,7 +789,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
ginBeginBAScan(&accum); ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
{ {
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE); ginEntryInsert(index, ginstate, attnum, entry, list, nlist, NULL);
if (vac_delay) if (vac_delay)
vacuum_delay_point(); vacuum_delay_point();
} }
...@@ -823,7 +823,7 @@ ginInsertCleanup(Relation index, GinState *ginstate, ...@@ -823,7 +823,7 @@ ginInsertCleanup(Relation index, GinState *ginstate,
ginBeginBAScan(&accum); ginBeginBAScan(&accum);
while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL)
ginEntryInsert(index, ginstate, attnum, entry, list, nlist, FALSE); ginEntryInsert(index, ginstate, attnum, entry, list, nlist, NULL);
} }
/* /*
......
...@@ -27,6 +27,7 @@ typedef struct ...@@ -27,6 +27,7 @@ typedef struct
{ {
GinState ginstate; GinState ginstate;
double indtuples; double indtuples;
GinStatsData buildStats;
MemoryContext tmpCtx; MemoryContext tmpCtx;
MemoryContext funcCtx; MemoryContext funcCtx;
BuildAccumulator accum; BuildAccumulator accum;
...@@ -97,8 +98,10 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) ...@@ -97,8 +98,10 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems)
* GinFormTuple(). * GinFormTuple().
*/ */
static IndexTuple static IndexTuple
addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, addItemPointersToTuple(Relation index, GinState *ginstate,
IndexTuple old, ItemPointerData *items, uint32 nitem, bool isBuild) GinBtreeStack *stack, IndexTuple old,
ItemPointerData *items, uint32 nitem,
GinStatsData *buildStats)
{ {
Datum key = gin_index_getattr(ginstate, old); Datum key = gin_index_getattr(ginstate, old);
OffsetNumber attnum = gintuple_get_attrnum(ginstate, old); OffsetNumber attnum = gintuple_get_attrnum(ginstate, old);
...@@ -128,11 +131,15 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, ...@@ -128,11 +131,15 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack,
GinSetPostingTree(res, postingRoot); GinSetPostingTree(res, postingRoot);
gdi = prepareScanPostingTree(index, postingRoot, FALSE); gdi = prepareScanPostingTree(index, postingRoot, FALSE);
gdi->btree.isBuild = isBuild; gdi->btree.isBuild = (buildStats != NULL);
insertItemPointer(gdi, items, nitem); ginInsertItemPointer(gdi, items, nitem, buildStats);
pfree(gdi); pfree(gdi);
/* During index build, count the newly-added data page */
if (buildStats)
buildStats->nDataPages++;
} }
return res; return res;
...@@ -140,18 +147,25 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, ...@@ -140,18 +147,25 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack,
/* /*
* Inserts only one entry to the index, but it can add more than 1 ItemPointer. * Inserts only one entry to the index, but it can add more than 1 ItemPointer.
*
* During an index build, buildStats is non-null and the counters
* it contains should be incremented as needed.
*/ */
void void
ginEntryInsert(Relation index, GinState *ginstate, ginEntryInsert(Relation index, GinState *ginstate,
OffsetNumber attnum, Datum value, OffsetNumber attnum, Datum value,
ItemPointerData *items, uint32 nitem, ItemPointerData *items, uint32 nitem,
bool isBuild) GinStatsData *buildStats)
{ {
GinBtreeData btree; GinBtreeData btree;
GinBtreeStack *stack; GinBtreeStack *stack;
IndexTuple itup; IndexTuple itup;
Page page; Page page;
/* During index build, count the to-be-inserted entry */
if (buildStats)
buildStats->nEntries++;
prepareEntryScan(&btree, index, attnum, value, ginstate); prepareEntryScan(&btree, index, attnum, value, ginstate);
stack = ginFindLeafPage(&btree, NULL); stack = ginFindLeafPage(&btree, NULL);
...@@ -174,14 +188,15 @@ ginEntryInsert(Relation index, GinState *ginstate, ...@@ -174,14 +188,15 @@ ginEntryInsert(Relation index, GinState *ginstate,
/* insert into posting tree */ /* insert into posting tree */
gdi = prepareScanPostingTree(index, rootPostingTree, FALSE); gdi = prepareScanPostingTree(index, rootPostingTree, FALSE);
gdi->btree.isBuild = isBuild; gdi->btree.isBuild = (buildStats != NULL);
insertItemPointer(gdi, items, nitem); ginInsertItemPointer(gdi, items, nitem, buildStats);
pfree(gdi); pfree(gdi);
return; return;
} }
itup = addItemPointersToTuple(index, ginstate, stack, itup, items, nitem, isBuild); itup = addItemPointersToTuple(index, ginstate, stack, itup,
items, nitem, buildStats);
btree.isDelete = TRUE; btree.isDelete = TRUE;
} }
...@@ -195,13 +210,14 @@ ginEntryInsert(Relation index, GinState *ginstate, ...@@ -195,13 +210,14 @@ ginEntryInsert(Relation index, GinState *ginstate,
/* Add the rest, making a posting tree if necessary */ /* Add the rest, making a posting tree if necessary */
IndexTuple previtup = itup; IndexTuple previtup = itup;
itup = addItemPointersToTuple(index, ginstate, stack, previtup, items + 1, nitem - 1, isBuild); itup = addItemPointersToTuple(index, ginstate, stack, previtup,
items + 1, nitem - 1, buildStats);
pfree(previtup); pfree(previtup);
} }
} }
btree.entry = itup; btree.entry = itup;
ginInsertValue(&btree, stack); ginInsertValue(&btree, stack, buildStats);
pfree(itup); pfree(itup);
} }
...@@ -260,7 +276,8 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values, ...@@ -260,7 +276,8 @@ ginBuildCallback(Relation index, HeapTuple htup, Datum *values,
{ {
/* there could be many entries, so be willing to abort here */ /* there could be many entries, so be willing to abort here */
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
ginEntryInsert(index, &buildstate->ginstate, attnum, entry, list, nlist, TRUE); ginEntryInsert(index, &buildstate->ginstate, attnum, entry,
list, nlist, &buildstate->buildStats);
} }
MemoryContextReset(buildstate->tmpCtx); MemoryContextReset(buildstate->tmpCtx);
...@@ -292,6 +309,8 @@ ginbuild(PG_FUNCTION_ARGS) ...@@ -292,6 +309,8 @@ ginbuild(PG_FUNCTION_ARGS)
RelationGetRelationName(index)); RelationGetRelationName(index));
initGinState(&buildstate.ginstate, index); initGinState(&buildstate.ginstate, index);
buildstate.indtuples = 0;
memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
/* initialize the meta page */ /* initialize the meta page */
MetaBuffer = GinNewBuffer(index); MetaBuffer = GinNewBuffer(index);
...@@ -331,8 +350,8 @@ ginbuild(PG_FUNCTION_ARGS) ...@@ -331,8 +350,8 @@ ginbuild(PG_FUNCTION_ARGS)
UnlockReleaseBuffer(RootBuffer); UnlockReleaseBuffer(RootBuffer);
END_CRIT_SECTION(); END_CRIT_SECTION();
/* build the index */ /* count the root as first entry page */
buildstate.indtuples = 0; buildstate.buildStats.nEntryPages++;
/* /*
* create a temporary memory context that is reset once for each tuple * create a temporary memory context that is reset once for each tuple
...@@ -367,12 +386,19 @@ ginbuild(PG_FUNCTION_ARGS) ...@@ -367,12 +386,19 @@ ginbuild(PG_FUNCTION_ARGS)
{ {
/* there could be many entries, so be willing to abort here */ /* there could be many entries, so be willing to abort here */
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
ginEntryInsert(index, &buildstate.ginstate, attnum, entry, list, nlist, TRUE); ginEntryInsert(index, &buildstate.ginstate, attnum, entry,
list, nlist, &buildstate.buildStats);
} }
MemoryContextSwitchTo(oldCtx); MemoryContextSwitchTo(oldCtx);
MemoryContextDelete(buildstate.tmpCtx); MemoryContextDelete(buildstate.tmpCtx);
/*
* Update metapage stats
*/
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
ginUpdateStats(index, &buildstate.buildStats);
/* /*
* Return statistics * Return statistics
*/ */
...@@ -401,7 +427,7 @@ ginHeapTupleInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datu ...@@ -401,7 +427,7 @@ ginHeapTupleInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datu
return 0; return 0;
for (i = 0; i < nentries; i++) for (i = 0; i < nentries; i++)
ginEntryInsert(index, ginstate, attnum, entries[i], item, 1, FALSE); ginEntryInsert(index, ginstate, attnum, entries[i], item, 1, NULL);
return nentries; return nentries;
} }
......
...@@ -13,10 +13,12 @@ ...@@ -13,10 +13,12 @@
*/ */
#include "postgres.h" #include "postgres.h"
#include "access/genam.h" #include "access/genam.h"
#include "access/gin.h" #include "access/gin.h"
#include "access/reloptions.h" #include "access/reloptions.h"
#include "catalog/pg_type.h" #include "catalog/pg_type.h"
#include "miscadmin.h"
#include "storage/bufmgr.h" #include "storage/bufmgr.h"
#include "storage/freespace.h" #include "storage/freespace.h"
#include "storage/indexfsm.h" #include "storage/indexfsm.h"
...@@ -227,6 +229,10 @@ GinInitMetabuffer(Buffer b) ...@@ -227,6 +229,10 @@ GinInitMetabuffer(Buffer b)
metadata->tailFreeSize = 0; metadata->tailFreeSize = 0;
metadata->nPendingPages = 0; metadata->nPendingPages = 0;
metadata->nPendingHeapTuples = 0; metadata->nPendingHeapTuples = 0;
metadata->nTotalPages = 0;
metadata->nEntryPages = 0;
metadata->nDataPages = 0;
metadata->nEntries = 0;
} }
int int
...@@ -354,3 +360,82 @@ ginoptions(PG_FUNCTION_ARGS) ...@@ -354,3 +360,82 @@ ginoptions(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(rdopts); PG_RETURN_BYTEA_P(rdopts);
} }
/*
 * ginGetStats -- copy the planner statistics kept on the index metapage
 * into *stats.
 *
 * The metapage is read while holding a share lock on its buffer; the lock
 * and pin are dropped before returning.
 *
 * Only nPendingPages is guaranteed to be current in the result.  The
 * remaining fields reflect whatever was stored as of the last VACUUM.
 */
void
ginGetStats(Relation index, GinStatsData *stats)
{
	Buffer		metabuf = ReadBuffer(index, GIN_METAPAGE_BLKNO);
	GinMetaPageData *meta;

	LockBuffer(metabuf, GIN_SHARE);
	meta = GinPageGetMeta(BufferGetPage(metabuf));

	/* stored counters (possibly stale) */
	stats->nTotalPages = meta->nTotalPages;
	stats->nEntryPages = meta->nEntryPages;
	stats->nDataPages = meta->nDataPages;
	stats->nEntries = meta->nEntries;

	/* live counter */
	stats->nPendingPages = meta->nPendingPages;

	UnlockReleaseBuffer(metabuf);
}
/*
 * ginUpdateStats -- store the planner statistics from *stats on the index
 * metapage.
 *
 * nTotalPages, nEntryPages, nDataPages and nEntries are overwritten with
 * the caller's values; the metapage's nPendingPages is deliberately left
 * alone.  The update is done under an exclusive buffer lock inside a
 * critical section, and unless the index is a temp relation the resulting
 * metapage contents are WAL-logged as an XLOG_GIN_UPDATE_META_PAGE record.
 */
void
ginUpdateStats(Relation index, const GinStatsData *stats)
{
	Buffer		metabuf = ReadBuffer(index, GIN_METAPAGE_BLKNO);
	Page		page;
	GinMetaPageData *meta;

	LockBuffer(metabuf, GIN_EXCLUSIVE);
	page = BufferGetPage(metabuf);
	meta = GinPageGetMeta(page);

	START_CRIT_SECTION();

	meta->nTotalPages = stats->nTotalPages;
	meta->nEntryPages = stats->nEntryPages;
	meta->nDataPages = stats->nDataPages;
	meta->nEntries = stats->nEntries;

	MarkBufferDirty(metabuf);

	if (!index->rd_istemp)
	{
		ginxlogUpdateMeta xlrec;
		XLogRecData rdata;
		XLogRecPtr	lsn;

		/* no pending-list tuples or page links accompany a stats-only update */
		xlrec.node = index->rd_node;
		xlrec.ntuples = 0;
		xlrec.prevTail = InvalidBlockNumber;
		xlrec.newRightlink = InvalidBlockNumber;

		/* log the whole metapage image, taken after the stats were changed */
		memcpy(&xlrec.metadata, meta, sizeof(GinMetaPageData));

		rdata.buffer = InvalidBuffer;
		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(ginxlogUpdateMeta);
		rdata.next = NULL;

		lsn = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, &rdata);
		PageSetLSN(page, lsn);
		PageSetTLI(page, ThisTimeLineID);
	}

	UnlockReleaseBuffer(metabuf);

	END_CRIT_SECTION();
}
...@@ -707,9 +707,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) ...@@ -707,9 +707,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
BlockNumber npages, BlockNumber npages,
blkno; blkno;
BlockNumber totFreePages; BlockNumber totFreePages;
BlockNumber lastBlock = GIN_ROOT_BLKNO,
lastFilledBlock = GIN_ROOT_BLKNO;
GinState ginstate; GinState ginstate;
GinStatsData idxStat;
/* /*
* In an autovacuum analyze, we want to clean up pending insertions. * In an autovacuum analyze, we want to clean up pending insertions.
...@@ -736,6 +735,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) ...@@ -736,6 +735,8 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
ginInsertCleanup(index, &ginstate, true, stats); ginInsertCleanup(index, &ginstate, true, stats);
} }
memset(&idxStat, 0, sizeof(idxStat));
/* /*
* XXX we always report the heap tuple count as the number of index * XXX we always report the heap tuple count as the number of index
* entries. This is bogus if the index is partial, but it's real hard to * entries. This is bogus if the index is partial, but it's real hard to
...@@ -757,7 +758,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) ...@@ -757,7 +758,7 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
totFreePages = 0; totFreePages = 0;
for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++) for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++)
{ {
Buffer buffer; Buffer buffer;
Page page; Page page;
...@@ -771,15 +772,28 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) ...@@ -771,15 +772,28 @@ ginvacuumcleanup(PG_FUNCTION_ARGS)
if (GinPageIsDeleted(page)) if (GinPageIsDeleted(page))
{ {
Assert(blkno != GIN_ROOT_BLKNO);
RecordFreeIndexPage(index, blkno); RecordFreeIndexPage(index, blkno);
totFreePages++; totFreePages++;
} }
else else if (GinPageIsData(page))
lastFilledBlock = blkno; {
idxStat.nDataPages++;
}
else if (!GinPageIsList(page))
{
idxStat.nEntryPages++;
if ( GinPageIsLeaf(page) )
idxStat.nEntries += PageGetMaxOffsetNumber(page);
}
UnlockReleaseBuffer(buffer); UnlockReleaseBuffer(buffer);
} }
lastBlock = npages - 1;
/* Update the metapage with accurate page and entry counts */
idxStat.nTotalPages = npages;
ginUpdateStats(info->index, &idxStat);
/* Finally, vacuum the FSM */ /* Finally, vacuum the FSM */
IndexFreeSpaceMapVacuum(info->index); IndexFreeSpaceMapVacuum(info->index);
......
...@@ -839,7 +839,7 @@ ginContinueSplit(ginIncompleteSplit *split) ...@@ -839,7 +839,7 @@ ginContinueSplit(ginIncompleteSplit *split)
stack.parent = NULL; stack.parent = NULL;
findParents(&btree, &stack, split->rootBlkno); findParents(&btree, &stack, split->rootBlkno);
ginInsertValue(&btree, stack.parent); ginInsertValue(&btree, stack.parent, NULL);
FreeFakeRelcacheEntry(reln); FreeFakeRelcacheEntry(reln);
......
...@@ -91,6 +91,7 @@ ...@@ -91,6 +91,7 @@
#include <ctype.h> #include <ctype.h>
#include <math.h> #include <math.h>
#include "access/gin.h"
#include "access/sysattr.h" #include "access/sysattr.h"
#include "catalog/index.h" #include "catalog/index.h"
#include "catalog/pg_opfamily.h" #include "catalog/pg_opfamily.h"
...@@ -6235,6 +6236,24 @@ gistcostestimate(PG_FUNCTION_ARGS) ...@@ -6235,6 +6236,24 @@ gistcostestimate(PG_FUNCTION_ARGS)
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
/*
 * find_index_column
 *		Locate the index column that "op" refers to.
 *
 * Columns are tried in order using match_index_to_operand(); the zero-based
 * position of the first one that matches is returned.  If none of the
 * index's columns matches, the result is -1.
 */
static int
find_index_column(Node *op, IndexOptInfo *index)
{
	int			colno = 0;

	while (colno < index->ncolumns)
	{
		if (match_index_to_operand(op, colno, index))
			return colno;
		colno++;
	}

	return -1;
}
/*
* GIN has search behavior completely different from other index types
*/
Datum Datum
gincostestimate(PG_FUNCTION_ARGS) gincostestimate(PG_FUNCTION_ARGS)
{ {
...@@ -6246,10 +6265,329 @@ gincostestimate(PG_FUNCTION_ARGS) ...@@ -6246,10 +6265,329 @@ gincostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6);
double *indexCorrelation = (double *) PG_GETARG_POINTER(7); double *indexCorrelation = (double *) PG_GETARG_POINTER(7);
ListCell *l;
int32 nfullscan = 0;
List *selectivityQuals;
double numPages = index->pages,
numTuples = index->tuples;
double numEntryPages,
numDataPages,
numPendingPages,
numEntries;
double partialEntriesInQuals = 0.0;
double searchEntriesInQuals = 0.0;
double exactEntriesInQuals = 0.0;
double entryPagesFetched,
dataPagesFetched,
dataPagesFetchedBySel;
double qual_op_cost,
qual_arg_cost,
spc_random_page_cost,
num_scans;
QualCost index_qual_cost;
Relation indexRel;
GinStatsData ginStats;
genericcostestimate(root, index, indexQuals, outer_rel, 0.0, /*
indexStartupCost, indexTotalCost, * Obtain statistic information from the meta page
indexSelectivity, indexCorrelation); */
indexRel = index_open(index->indexoid, AccessShareLock);
ginGetStats(indexRel, &ginStats);
index_close(indexRel, AccessShareLock);
numEntryPages = ginStats.nEntryPages;
numDataPages = ginStats.nDataPages;
numPendingPages = ginStats.nPendingPages;
numEntries = ginStats.nEntries;
/*
* nPendingPages can be trusted, but the other fields are as of the last
* VACUUM. Scale them by the ratio numPages / nTotalPages to account for
* growth since then. If the fields are zero (implying no VACUUM at all,
* and an index created pre-9.1), assume all pages are entry pages.
*/
if (ginStats.nTotalPages == 0 || ginStats.nEntryPages == 0)
{
numEntryPages = numPages;
numDataPages = 0;
numEntries = numTuples; /* bogus, but no other info available */
}
else
{
double scale = numPages / ginStats.nTotalPages;
numEntryPages = ceil(numEntryPages * scale);
numDataPages = ceil(numDataPages * scale);
numEntries = ceil(numEntries * scale);
/* ensure we didn't round up too much */
numEntryPages = Min(numEntryPages, numPages);
numDataPages = Min(numDataPages, numPages - numEntryPages);
}
/*
* Include predicate in selectivityQuals (should match genericcostestimate)
*/
if (index->indpred != NIL)
{
List *predExtraQuals = NIL;
foreach(l, index->indpred)
{
Node *predQual = (Node *) lfirst(l);
List *oneQual = list_make1(predQual);
if (!predicate_implied_by(oneQual, indexQuals))
predExtraQuals = list_concat(predExtraQuals, oneQual);
}
/* list_concat avoids modifying the passed-in indexQuals list */
selectivityQuals = list_concat(predExtraQuals, indexQuals);
}
else
selectivityQuals = indexQuals;
/* Estimate the fraction of main-table tuples that will be visited */
*indexSelectivity = clauselist_selectivity(root, selectivityQuals,
index->rel->relid,
JOIN_INNER,
NULL);
/* fetch estimated page cost for tablespace containing index */
get_tablespace_page_costs(index->reltablespace,
&spc_random_page_cost,
NULL);
/*
* Generic assumption about index correlation: there isn't any.
*/
*indexCorrelation = 0.0;
/*
* Examine quals to estimate number of search entries & partial matches
*/
foreach(l, indexQuals)
{
RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
Expr *clause;
Node *leftop,
*rightop,
*operand;
Oid extractProcOid;
Oid clause_op;
int strategy_op;
Oid lefttype,
righttype;
int32 nentries = 0;
bool *partial_matches = NULL;
Pointer *extra_data = NULL;
int indexcol;
Assert(IsA(rinfo, RestrictInfo));
clause = rinfo->clause;
Assert(IsA(clause, OpExpr));
leftop = get_leftop(clause);
rightop = get_rightop(clause);
clause_op = ((OpExpr *) clause)->opno;
if ((indexcol = find_index_column(leftop, index)) >= 0)
{
operand = rightop;
}
else if ((indexcol = find_index_column(rightop, index)) >= 0)
{
operand = leftop;
clause_op = get_commutator(clause_op);
}
else
{
elog(ERROR, "Could not match index to operand");
operand = NULL; /* keep compiler quiet */
}
if (IsA(operand, RelabelType))
operand = (Node *) ((RelabelType *) operand)->arg;
/*
* It's impossible to call extractQuery method for unknown operand.
* So unless operand is a Const we can't do much; just assume there
* will be one ordinary search entry from the operand at runtime.
*/
if (!IsA(operand, Const))
{
searchEntriesInQuals++;
continue;
}
/* If Const is null, there can be no matches */
if (((Const*) operand)->constisnull)
{
*indexStartupCost = 0;
*indexTotalCost = 0;
*indexSelectivity = 0;
PG_RETURN_VOID();
}
/*
* Get the operator's strategy number and declared input data types
* within the index opfamily.
*/
get_op_opfamily_properties(clause_op, index->opfamily[indexcol],
&strategy_op, &lefttype, &righttype);
/*
* GIN (like GiST) always has lefttype == righttype in pg_amproc
* and they are equal to type Oid on which index was created/designed
*/
extractProcOid = get_opfamily_proc(index->opfamily[indexcol],
lefttype, lefttype,
GIN_EXTRACTQUERY_PROC);
if (!OidIsValid(extractProcOid))
{
/* probably shouldn't happen, but cope sanely if so */
searchEntriesInQuals++;
continue;
}
OidFunctionCall5(extractProcOid,
((Const*)operand)->constvalue,
PointerGetDatum(&nentries),
UInt16GetDatum(strategy_op),
PointerGetDatum(&partial_matches),
PointerGetDatum(&extra_data));
if (nentries == 0)
{
nfullscan++;
}
else if (nentries < 0)
{
/*
* GIN_EXTRACTQUERY_PROC guarantees that nothing will be found
*/
*indexStartupCost = 0;
*indexTotalCost = 0;
*indexSelectivity = 0;
PG_RETURN_VOID();
}
else
{
int i;
for (i=0; i<nentries; i++)
{
/*
* For partial match we haven't any information to estimate
* number of matched entries in index, so, we just estimate it
* as 100
*/
if (partial_matches && partial_matches[i])
partialEntriesInQuals += 100;
else
exactEntriesInQuals++;
searchEntriesInQuals++;
}
}
}
if (nfullscan == list_length(indexQuals))
searchEntriesInQuals = numEntries;
/* Will we have more than one iteration of a nestloop scan? */
if (outer_rel != NULL && outer_rel->rows > 1)
num_scans = outer_rel->rows;
else
num_scans = 1;
/*
* cost to begin scan, first of all, pay attention to
* pending list.
*/
entryPagesFetched = numPendingPages;
/*
* Estimate number of entry pages read. We need to do
* searchEntriesInQuals searches. Use a power function as it should be,
* but tuples on leaf pages usually is much greater.
* Here we include all searches in entry tree, including
* search of first entry in partial match algorithm
*/
entryPagesFetched += ceil(searchEntriesInQuals * rint(pow(numEntryPages, 0.15)));
/*
* Add an estimate of entry pages read by partial match algorithm.
* It's a scan over leaf pages in entry tree. We haven't any useful stats
* here, so estimate it as proportion.
*/
entryPagesFetched += ceil(numEntryPages * partialEntriesInQuals / numEntries);
/*
* Partial match algorithm reads all data pages before
* doing actual scan, so it's a startup cost. Again,
* we haven't any useful stats here, so, estimate it as
* proportion
*/
dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries);
/* calculate cache effects */
if (num_scans > 1 || searchEntriesInQuals > 1)
{
entryPagesFetched = index_pages_fetched(entryPagesFetched,
(BlockNumber) numEntryPages,
numEntryPages, root);
dataPagesFetched = index_pages_fetched(dataPagesFetched,
(BlockNumber) numDataPages,
numDataPages, root);
}
/*
* Here we use random page cost because logically-close pages could be
* far apart on disk.
*/
*indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost;
/* cost to scan data pages for each exact (non-partial) matched entry */
dataPagesFetched = ceil(numDataPages * exactEntriesInQuals / numEntries);
/*
* Estimate number of data pages read, using selectivity estimation and
* capacity of data page.
*/
dataPagesFetchedBySel = ceil(*indexSelectivity *
(numTuples / (BLCKSZ/SizeOfIptrData)));
if (dataPagesFetchedBySel > dataPagesFetched)
{
/*
* At least one of entries is very frequent and, unfortunately,
* we couldn't get statistic about entries (only tsvector has
* such statistics). So, we obviously have too small estimation of
* pages fetched from data tree. Re-estimate it from known
* capacity of data pages
*/
dataPagesFetched = dataPagesFetchedBySel;
}
if (num_scans > 1)
dataPagesFetched = index_pages_fetched(dataPagesFetched,
(BlockNumber) numDataPages,
numDataPages, root);
*indexTotalCost = *indexStartupCost +
dataPagesFetched * spc_random_page_cost;
/*
* Add on index qual eval costs, much as in genericcostestimate
*/
cost_qual_eval(&index_qual_cost, indexQuals, root);
qual_op_cost = cpu_operator_cost * list_length(indexQuals);
qual_arg_cost = index_qual_cost.startup +
index_qual_cost.per_tuple - qual_op_cost;
if (qual_arg_cost < 0) /* just in case... */
qual_arg_cost = 0;
*indexStartupCost += qual_arg_cost;
*indexTotalCost += qual_arg_cost;
*indexTotalCost += ( numTuples * *indexSelectivity ) * (cpu_index_tuple_cost + qual_op_cost);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -79,6 +79,14 @@ typedef struct GinMetaPageData ...@@ -79,6 +79,14 @@ typedef struct GinMetaPageData
*/ */
BlockNumber nPendingPages; BlockNumber nPendingPages;
int64 nPendingHeapTuples; int64 nPendingHeapTuples;
/*
* Statistics for planner use (accurate as of last VACUUM)
*/
BlockNumber nTotalPages;
BlockNumber nEntryPages;
BlockNumber nDataPages;
int64 nEntries;
} GinMetaPageData; } GinMetaPageData;
#define GinPageGetMeta(p) \ #define GinPageGetMeta(p) \
...@@ -94,6 +102,8 @@ typedef struct GinMetaPageData ...@@ -94,6 +102,8 @@ typedef struct GinMetaPageData
#define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) #define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF )
#define GinPageIsData(page) ( GinPageGetOpaque(page)->flags & GIN_DATA ) #define GinPageIsData(page) ( GinPageGetOpaque(page)->flags & GIN_DATA )
#define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) #define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA )
#define GinPageIsList(page) ( GinPageGetOpaque(page)->flags & GIN_LIST )
#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST )
#define GinPageHasFullRow(page) ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW ) #define GinPageHasFullRow(page) ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW )
#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) #define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW )
...@@ -362,13 +372,28 @@ extern Datum *extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum va ...@@ -362,13 +372,28 @@ extern Datum *extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum va
extern Datum gin_index_getattr(GinState *ginstate, IndexTuple tuple); extern Datum gin_index_getattr(GinState *ginstate, IndexTuple tuple);
extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple); extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
/*
 * GinStatsData represents stats data for planner use
 *
 * This is the in-memory form of the statistics kept on the GIN metapage:
 * ginGetStats() fills it from the metapage, and ginUpdateStats() writes
 * every field except nPendingPages back.  During an index build the same
 * struct is used to accumulate the counters as pages and entries are added.
 *
 * Only nPendingPages is up-to-date when read back; the other fields are
 * accurate as of the last VACUUM.
 */
typedef struct GinStatsData
{
BlockNumber nPendingPages;	/* pending-list pages; copied from the metapage, never written back */
BlockNumber nTotalPages;	/* number of blocks in the index when the stats were stored */
BlockNumber nEntryPages;	/* pages counted as "entry" pages */
BlockNumber nDataPages;		/* pages counted as "data" pages */
int64 nEntries;				/* entries counted, as an estimate of distinct key values */
} GinStatsData;
extern void ginGetStats(Relation index, GinStatsData *stats);
extern void ginUpdateStats(Relation index, const GinStatsData *stats);
/* gininsert.c */ /* gininsert.c */
extern Datum ginbuild(PG_FUNCTION_ARGS); extern Datum ginbuild(PG_FUNCTION_ARGS);
extern Datum gininsert(PG_FUNCTION_ARGS); extern Datum gininsert(PG_FUNCTION_ARGS);
extern void ginEntryInsert(Relation index, GinState *ginstate, extern void ginEntryInsert(Relation index, GinState *ginstate,
OffsetNumber attnum, Datum value, OffsetNumber attnum, Datum value,
ItemPointerData *items, uint32 nitem, ItemPointerData *items, uint32 nitem,
bool isBuild); GinStatsData *buildStats);
/* ginxlog.c */ /* ginxlog.c */
extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); extern void gin_redo(XLogRecPtr lsn, XLogRecord *record);
...@@ -406,6 +431,7 @@ typedef struct GinBtreeData ...@@ -406,6 +431,7 @@ typedef struct GinBtreeData
Page (*splitPage) (GinBtree, Buffer, Buffer, OffsetNumber, XLogRecData **); Page (*splitPage) (GinBtree, Buffer, Buffer, OffsetNumber, XLogRecData **);
void (*fillRoot) (GinBtree, Buffer, Buffer, Buffer); void (*fillRoot) (GinBtree, Buffer, Buffer, Buffer);
bool isData;
bool searchMode; bool searchMode;
Relation index; Relation index;
...@@ -432,7 +458,8 @@ typedef struct GinBtreeData ...@@ -432,7 +458,8 @@ typedef struct GinBtreeData
extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno); extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno);
extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack); extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack);
extern void freeGinBtreeStack(GinBtreeStack *stack); extern void freeGinBtreeStack(GinBtreeStack *stack);
extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack); extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
GinStatsData *buildStats);
extern void findParents(GinBtree btree, GinBtreeStack *stack, BlockNumber rootBlkno); extern void findParents(GinBtree btree, GinBtreeStack *stack, BlockNumber rootBlkno);
/* ginentrypage.c */ /* ginentrypage.c */
...@@ -462,8 +489,9 @@ typedef struct ...@@ -462,8 +489,9 @@ typedef struct
extern GinPostingTreeScan *prepareScanPostingTree(Relation index, extern GinPostingTreeScan *prepareScanPostingTree(Relation index,
BlockNumber rootBlkno, bool searchMode); BlockNumber rootBlkno, bool searchMode);
extern void insertItemPointer(GinPostingTreeScan *gdi, extern void ginInsertItemPointer(GinPostingTreeScan *gdi,
ItemPointerData *items, uint32 nitem); ItemPointerData *items, uint32 nitem,
GinStatsData *buildStats);
extern Buffer scanBeginPostingTree(GinPostingTreeScan *gdi); extern Buffer scanBeginPostingTree(GinPostingTreeScan *gdi);
extern void dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf); extern void dataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf);
extern void prepareDataScan(GinBtree btree, Relation index); extern void prepareDataScan(GinBtree btree, Relation index);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment