Commit be14f884 authored by Peter Geoghegan

Fix deduplication "single value" strategy bug.

It was possible for deduplication's single value strategy to mistakenly
believe that a very small duplicate tuple counts as one of the six large
tuples that it aims to leave behind after the page finally splits.  This
could cause slightly suboptimal space utilization with very low
cardinality indexes, though only under fairly narrow conditions.

To fix, be particular about what kind of tuple counts as a
maxpostingsize-capped tuple.  This avoids confusion in the event of a
small tuple that gets "wedged" between two large tuples, where all
tuples on the page are duplicates of the same value.

Discussion: https://postgr.es/m/CAH2-Wz=Y+sgSFc-O3LpiZX-POx2bC+okec2KafERHuzdVa7-rQ@mail.gmail.com
Backpatch: 13-, where deduplication was introduced (by commit 0d861bbb)
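
For illustration only (not part of the commit): a minimal standalone C sketch of the counting problem described above. The tuple sizes and the MAXPOSTINGSIZE constant are invented stand-ins; the real fix tracks this state in BTDedupState->nmaxitems and only bumps it in _bt_dedup_save_htid for an oversized tuple with more than 50 TIDs (see the diff below). The sketch just contrasts the old per-output-tuple counter with a counter that only counts maxpostingsize-capped tuples.

/*
 * Hypothetical sketch, not PostgreSQL source: contrast the old counter
 * (every tuple written to newpage) with the fixed counter (only
 * maxpostingsize-capped tuples).  All sizes below are made up.
 */
#include <stdio.h>

#define MAXPOSTINGSIZE 1000		/* stand-in for BTMaxItemSize(page) / 2 */

int
main(void)
{
	/* page of duplicates: a tiny plain tuple wedged between large ones */
	int		sizes[] = {1000, 16, 1000, 1000, 1000, 1000};
	int		ntuples = sizeof(sizes) / sizeof(sizes[0]);
	int		newpagendataitems = 0;	/* old logic: counts every item */
	int		nmaxitems = 0;			/* new logic: counts capped items only */

	for (int i = 0; i < ntuples; i++)
	{
		newpagendataitems++;
		if (sizes[i] >= MAXPOSTINGSIZE)
			nmaxitems++;
	}

	/*
	 * The old counter reaches its limit of 6 even though only 5 tuples are
	 * really "large", so the single value strategy stops one tuple early.
	 */
	printf("newpagendataitems = %d, nmaxitems = %d\n",
		   newpagendataitems, nmaxitems);
	return 0;
}
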
parent f9e9704f
@@ -62,7 +62,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 	Page		page = BufferGetPage(buf);
 	BTPageOpaque opaque;
 	Page		newpage;
-	int			newpagendataitems = 0;
 	OffsetNumber deletable[MaxIndexTuplesPerPage];
 	BTDedupState state;
 	int			ndeletable = 0;
@@ -124,6 +123,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 	 */
 	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
 	state->deduplicate = true;
+	state->nmaxitems = 0;
 	state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
 	/* Metadata about base tuple of current pending posting list */
 	state->base = NULL;
@@ -204,26 +204,25 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 			 * reset the state and move on without modifying the page.
 			 */
 			pagesaving += _bt_dedup_finish_pending(newpage, state);
-			newpagendataitems++;
 
 			if (singlevalstrat)
 			{
 				/*
 				 * Single value strategy's extra steps.
 				 *
-				 * Lower maxpostingsize for sixth and final item that might be
-				 * deduplicated by current deduplication pass.  When sixth
-				 * item formed/observed, stop deduplicating items.
+				 * Lower maxpostingsize for sixth and final large posting list
+				 * tuple at the point where 5 maxpostingsize-capped tuples
+				 * have either been formed or observed.
 				 *
-				 * Note: It's possible that this will be reached even when
-				 * current deduplication pass has yet to merge together some
-				 * existing items.  It doesn't matter whether or not the
-				 * current call generated the maxpostingsize-capped duplicate
-				 * tuples at the start of the page.
+				 * When a sixth maxpostingsize-capped item is formed/observed,
+				 * stop merging together tuples altogether.  The few tuples
+				 * that remain at the end of the page won't be merged together
+				 * at all (at least not until after a future page split takes
+				 * place).
 				 */
-				if (newpagendataitems == 5)
+				if (state->nmaxitems == 5)
 					_bt_singleval_fillfactor(page, state, newitemsz);
-				else if (newpagendataitems == 6)
+				else if (state->nmaxitems == 6)
 				{
 					state->deduplicate = false;
 					singlevalstrat = false; /* won't be back here */
@@ -237,7 +236,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
 	/* Handle the last item */
 	pagesaving += _bt_dedup_finish_pending(newpage, state);
-	newpagendataitems++;
 
 	/*
 	 * If no items suitable for deduplication were found, newpage must be
@@ -404,7 +402,24 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
 						  (state->nhtids + nhtids) * sizeof(ItemPointerData));
 
 	if (mergedtupsz > state->maxpostingsize)
+	{
+		/*
+		 * Count this as an oversized item for single value strategy, though
+		 * only when there are 50 TIDs in the final posting list tuple.  This
+		 * limit (which is fairly arbitrary) avoids confusion about how many
+		 * 1/6 of a page tuples have been encountered/created by the current
+		 * deduplication pass.
+		 *
+		 * Note: We deliberately don't consider which deduplication pass
+		 * merged together tuples to create this item (could be a previous
+		 * deduplication pass, or current pass).  See _bt_do_singleval()
+		 * comments.
+		 */
+		if (state->nhtids > 50)
+			state->nmaxitems++;
+
 		return false;
+	}
 
 	/*
 	 * Save heap TIDs to pending posting list tuple -- itup can be merged into
...
@@ -1095,6 +1095,7 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
 		pfree(postingtuple);
 	}
 
+	dstate->nmaxitems = 0;
 	dstate->nhtids = 0;
 	dstate->nitems = 0;
 	dstate->phystupsize = 0;
@@ -1310,6 +1311,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
 	dstate->deduplicate = true; /* unused */
+	dstate->nmaxitems = 0;		/* unused */
 	dstate->maxpostingsize = 0; /* set later */
 	/* Metadata about base tuple of current pending posting list */
 	dstate->base = NULL;
...
@@ -483,6 +483,7 @@ btree_xlog_dedup(XLogReaderState *record)
 	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
 	state->deduplicate = true;	/* unused */
+	state->nmaxitems = 0;		/* unused */
 	/* Conservatively use larger maxpostingsize than primary */
 	state->maxpostingsize = BTMaxItemSize(page);
 	state->base = NULL;
...
@@ -739,6 +739,7 @@ typedef struct BTDedupStateData
 {
 	/* Deduplication status info for entire pass over page */
 	bool		deduplicate;	/* Still deduplicating page? */
+	int			nmaxitems;		/* Number of max-sized tuples so far */
 	Size		maxpostingsize; /* Limit on size of final tuple */
 
 	/* Metadata about base tuple of current pending posting list */