Commit 0bef1c06 authored by Teodor Sigaev

Re-think predicate locking on GIN indexes.

The principle behind the locking was not very well thought-out, and not
documented. Add a section in the README to explain how it's supposed to
work, and change the code so that it actually works that way.

This fixes two bugs:

1. If fast update was turned on concurrently, subsequent inserts to the
   pending list would not conflict with predicate locks that were acquired
   earlier, on entry pages. The included 'predicate-gin-fastupdate' test
   demonstrates that. To fix, make all scans acquire a predicate lock on
   the metapage. That lock represents a scan of the pending list, whether
   or not there is a pending list at the moment. Forget about the
   optimization to skip locking/checking for locks, when fastupdate=off.
2. If a scan finds no match, it still needs to lock the entry page. The
   point of predicate locks is to lock the gaps between values, whether
   or not there is a match. The included 'predicate-gin-nomatch' test
   tests that case.

In addition to those two bug fixes, this removes some unnecessary locking,
following the principle laid out in the README. Because all items in
a posting tree have the same key value, a lock on the posting tree root is
enough to cover all the items. (With a very large posting tree, it would
possibly be better to lock the posting tree leaf pages instead, so that in a
"skip scan" with a query like "A & B", you could avoid an unnecessary conflict
if a new tuple is inserted with A but !B. But let's keep this simple.)

Also, some spelling fixes.

Author: Heikki Linnakangas with some editorization by me
Review: Andrey Borodin, Alexander Korotkov
Discussion: https://www.postgresql.org/message-id/0b3ad2c2-2692-62a9-3a04-5724f2af9114@iki.fi
parent 7d867997
...@@ -331,6 +331,40 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the ...@@ -331,6 +331,40 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the
deleted pages around with the right-link intact until all concurrent scans deleted pages around with the right-link intact until all concurrent scans
have finished.) have finished.)
Predicate Locking
-----------------
GIN supports predicate locking, for serializable snapshot isolation.
Predicate locks represent that a scan has scanned a range of values. They
are not concerned with physical pages as such, but the logical key values.
A predicate lock on a page covers the key range that would belong on that
page, whether or not there are any matching tuples there currently. In other
words, a predicate lock on an index page covers the "gaps" between the index
tuples. To minimize false positives, predicate locks are acquired at the
finest level possible.
* Like in the B-tree index, it is enough to lock only leaf pages, because all
insertions happen at the leaf level.
* In an equality search (i.e. not a partial match search), if a key entry has
a posting tree, we lock the posting tree root page, to represent a lock on
just that key entry. Otherwise, we lock the entry tree page. We also lock
the entry tree page if no match is found, to lock the "gap" where the entry
would've been, had there been one.
* In a partial match search, we lock all the entry leaf pages that we scan,
in addition to locks on posting tree roots, to represent the "gaps" between
values.
* In addition to the locks on entry leaf pages and posting tree roots, all
scans grab a lock on the metapage. This is to interlock with insertions to
the fast update pending list. An insertion to the pending list can really
belong anywhere in the tree, and the lock on the metapage represents that.
The interlock for fastupdate pending lists means that with fastupdate=on,
we effectively always grab a full-index lock, so you could get a lot of false
positives.
Compatibility Compatibility
------------- -------------
......
...@@ -84,6 +84,9 @@ ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot) ...@@ -84,6 +84,9 @@ ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot)
stack->parent = NULL; stack->parent = NULL;
stack->predictNumber = 1; stack->predictNumber = 1;
if (!searchMode)
CheckForSerializableConflictIn(btree->index, NULL, stack->buffer);
for (;;) for (;;)
{ {
Page page; Page page;
......
...@@ -1812,8 +1812,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, ...@@ -1812,8 +1812,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
blkno = BufferGetBlockNumber(buffer); blkno = BufferGetBlockNumber(buffer);
/* /*
* Copy a predicate lock from entry tree leaf (containing posting list) to * Copy any predicate locks from the entry tree leaf (containing posting
* posting tree. * list) to the posting tree.
*/ */
PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno); PredicateLockPageSplit(index, BufferGetBlockNumber(entrybuffer), blkno);
...@@ -1864,7 +1864,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, ...@@ -1864,7 +1864,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
return blkno; return blkno;
} }
void static void
ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno)
{ {
memset(btree, 0, sizeof(GinBtreeData)); memset(btree, 0, sizeof(GinBtreeData));
...@@ -1911,7 +1911,6 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ...@@ -1911,7 +1911,6 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
btree.itemptr = insertdata.items[insertdata.curitem]; btree.itemptr = insertdata.items[insertdata.curitem];
stack = ginFindLeafPage(&btree, false, NULL); stack = ginFindLeafPage(&btree, false, NULL);
GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer);
ginInsertValue(&btree, stack, &insertdata, buildStats); ginInsertValue(&btree, stack, &insertdata, buildStats);
} }
} }
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include "postmaster/autovacuum.h" #include "postmaster/autovacuum.h"
#include "storage/indexfsm.h" #include "storage/indexfsm.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/builtins.h" #include "utils/builtins.h"
/* GUC parameter */ /* GUC parameter */
...@@ -245,6 +246,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) ...@@ -245,6 +246,13 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
metapage = BufferGetPage(metabuffer); metapage = BufferGetPage(metabuffer);
/*
* An insertion to the pending list could logically belong anywhere in
* the tree, so it conflicts with all serializable scans. All scans
* acquire a predicate lock on the metabuffer to represent that.
*/
CheckForSerializableConflictIn(index, NULL, metabuffer);
if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
{ {
/* /*
......
...@@ -35,20 +35,6 @@ typedef struct pendingPosition ...@@ -35,20 +35,6 @@ typedef struct pendingPosition
} pendingPosition; } pendingPosition;
/*
* Place predicate lock on GIN page if needed.
*/
static void
GinPredicateLockPage(Relation index, BlockNumber blkno, Snapshot snapshot)
{
/*
* When fast update is on then no need in locking pages, because we anyway
* need to lock the whole index.
*/
if (!GinGetUseFastUpdate(index))
PredicateLockPage(index, blkno, snapshot);
}
/* /*
* Goes to the next page if current offset is outside of bounds * Goes to the next page if current offset is outside of bounds
*/ */
...@@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot ...@@ -68,7 +54,7 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot
stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
stack->blkno = BufferGetBlockNumber(stack->buffer); stack->blkno = BufferGetBlockNumber(stack->buffer);
stack->off = FirstOffsetNumber; stack->off = FirstOffsetNumber;
GinPredicateLockPage(btree->index, stack->blkno, snapshot); PredicateLockPage(btree->index, stack->blkno, snapshot);
} }
return true; return true;
...@@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, ...@@ -100,11 +86,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
*/ */
for (;;) for (;;)
{ {
/*
* Predicate lock each leaf page in posting tree
*/
GinPredicateLockPage(index, BufferGetBlockNumber(buffer), snapshot);
page = BufferGetPage(buffer); page = BufferGetPage(buffer);
if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
{ {
...@@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, ...@@ -158,7 +139,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
* Predicate lock entry leaf page, following pages will be locked by * Predicate lock entry leaf page, following pages will be locked by
* moveRightIfItNeeded() * moveRightIfItNeeded()
*/ */
GinPredicateLockPage(btree->index, stack->buffer, snapshot); PredicateLockPage(btree->index, stack->buffer, snapshot);
for (;;) for (;;)
{ {
...@@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, ...@@ -253,6 +234,13 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
LockBuffer(stack->buffer, GIN_UNLOCK); LockBuffer(stack->buffer, GIN_UNLOCK);
/*
* Acquire predicate lock on the posting tree. We already hold
* a lock on the entry page, but insertions to the posting tree
* don't check for conflicts on that level.
*/
PredicateLockPage(btree->index, rootPostingTree, snapshot);
/* Collect all the TIDs in this entry's posting tree */ /* Collect all the TIDs in this entry's posting tree */
scanPostingTree(btree->index, scanEntry, rootPostingTree, scanPostingTree(btree->index, scanEntry, rootPostingTree,
snapshot); snapshot);
...@@ -400,10 +388,6 @@ restartScanEntry: ...@@ -400,10 +388,6 @@ restartScanEntry:
{ {
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
/* Predicate lock visited entry leaf page */
GinPredicateLockPage(ginstate->index,
BufferGetBlockNumber(stackEntry->buffer), snapshot);
if (GinIsPostingTree(itup)) if (GinIsPostingTree(itup))
{ {
BlockNumber rootPostingTree = GinGetPostingTree(itup); BlockNumber rootPostingTree = GinGetPostingTree(itup);
...@@ -411,6 +395,13 @@ restartScanEntry: ...@@ -411,6 +395,13 @@ restartScanEntry:
Page page; Page page;
ItemPointerData minItem; ItemPointerData minItem;
/*
* This is an equality scan, so lock the root of the posting tree.
* It represents a lock on the exact key value, and covers all the
* items in the posting tree.
*/
PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
/* /*
* We should unlock entry page before touching posting tree to * We should unlock entry page before touching posting tree to
* prevent deadlocks with vacuum processes. Because entry is never * prevent deadlocks with vacuum processes. Because entry is never
...@@ -425,12 +416,6 @@ restartScanEntry: ...@@ -425,12 +416,6 @@ restartScanEntry:
rootPostingTree, snapshot); rootPostingTree, snapshot);
entry->buffer = stack->buffer; entry->buffer = stack->buffer;
/*
* Predicate lock visited posting tree page, following pages will
* be locked by moveRightIfItNeeded or entryLoadMoreItems
*/
GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
/* /*
* We keep buffer pinned because we need to prevent deletion of * We keep buffer pinned because we need to prevent deletion of
* page during scan. See GIN's vacuum implementation. RefCount is * page during scan. See GIN's vacuum implementation. RefCount is
...@@ -452,15 +437,38 @@ restartScanEntry: ...@@ -452,15 +437,38 @@ restartScanEntry:
freeGinBtreeStack(stack); freeGinBtreeStack(stack);
entry->isFinished = false; entry->isFinished = false;
} }
else if (GinGetNPosting(itup) > 0) else
{ {
entry->list = ginReadTuple(ginstate, entry->attnum, itup, /*
&entry->nlist); * Lock the entry leaf page. This is more coarse-grained than
entry->predictNumberResult = entry->nlist; * necessary, because it will conflict with any insertions that
* land on the same leaf page, not only the exact key we searched
* for. But locking an individual tuple would require updating
* that lock whenever it moves because of insertions or vacuums,
* which seems too complicated.
*/
PredicateLockPage(ginstate->index,
BufferGetBlockNumber(stackEntry->buffer),
snapshot);
if (GinGetNPosting(itup) > 0)
{
entry->list = ginReadTuple(ginstate, entry->attnum, itup,
&entry->nlist);
entry->predictNumberResult = entry->nlist;
entry->isFinished = false; entry->isFinished = false;
}
} }
} }
else
{
/*
* No entry found. Predicate lock the leaf page, to lock the place
* where the entry would've been, had there been one.
*/
PredicateLockPage(ginstate->index,
BufferGetBlockNumber(stackEntry->buffer), snapshot);
}
if (needUnlock) if (needUnlock)
LockBuffer(stackEntry->buffer, GIN_UNLOCK); LockBuffer(stackEntry->buffer, GIN_UNLOCK);
...@@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key) ...@@ -533,7 +541,7 @@ startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
for (i = 0; i < key->nentries - 1; i++) for (i = 0; i < key->nentries - 1; i++)
{ {
/* Pass all entries <= i as false, and the rest as MAYBE */ /* Pass all entries <= i as FALSE, and the rest as MAYBE */
for (j = 0; j <= i; j++) for (j = 0; j <= i; j++)
key->entryRes[entryIndexes[j]] = GIN_FALSE; key->entryRes[entryIndexes[j]] = GIN_FALSE;
for (j = i + 1; j < key->nentries; j++) for (j = i + 1; j < key->nentries; j++)
...@@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ...@@ -673,8 +681,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
entry->btree.fullScan = false; entry->btree.fullScan = false;
stack = ginFindLeafPage(&entry->btree, true, snapshot); stack = ginFindLeafPage(&entry->btree, true, snapshot);
GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(stack->buffer), snapshot);
/* we don't need the stack, just the buffer. */ /* we don't need the stack, just the buffer. */
entry->buffer = stack->buffer; entry->buffer = stack->buffer;
IncrBufferRefCount(entry->buffer); IncrBufferRefCount(entry->buffer);
...@@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ...@@ -719,10 +725,6 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
entry->buffer = ginStepRight(entry->buffer, entry->buffer = ginStepRight(entry->buffer,
ginstate->index, ginstate->index,
GIN_SHARE); GIN_SHARE);
GinPredicateLockPage(ginstate->index, BufferGetBlockNumber(entry->buffer), snapshot);
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
} }
stepright = true; stepright = true;
...@@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key, ...@@ -1084,8 +1086,8 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
* lossy page even when none of the other entries match. * lossy page even when none of the other entries match.
* *
* Our strategy is to call the tri-state consistent function, with the * Our strategy is to call the tri-state consistent function, with the
* lossy-page entries set to MAYBE, and all the other entries false. If it * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
* returns false, none of the lossy items alone are enough for a match, so * returns FALSE, none of the lossy items alone are enough for a match, so
* we don't need to return a lossy-page pointer. Otherwise, return a * we don't need to return a lossy-page pointer. Otherwise, return a
* lossy-page pointer to indicate that the whole heap page must be * lossy-page pointer to indicate that the whole heap page must be
* checked. (On subsequent calls, we'll do nothing until minItem is past * checked. (On subsequent calls, we'll do nothing until minItem is past
...@@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) ...@@ -1746,8 +1748,7 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
} }
/* /*
* Collect all matched rows from pending list into bitmap. Also function * Collect all matched rows from pending list into bitmap.
* takes PendingLockRelation if it's needed.
*/ */
static void static void
scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
...@@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) ...@@ -1764,6 +1765,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
*ntids = 0; *ntids = 0;
/*
* Acquire predicate lock on the metapage, to conflict with any
* fastupdate insertions.
*/
PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
LockBuffer(metabuffer, GIN_SHARE); LockBuffer(metabuffer, GIN_SHARE);
page = BufferGetPage(metabuffer); page = BufferGetPage(metabuffer);
TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
...@@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) ...@@ -1777,24 +1784,9 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{ {
/* No pending list, so proceed with normal scan */ /* No pending list, so proceed with normal scan */
UnlockReleaseBuffer(metabuffer); UnlockReleaseBuffer(metabuffer);
/*
* If fast update is enabled, we acquire a predicate lock on the
* entire relation as fast update postpones the insertion of tuples
* into index structure due to which we can't detect rw conflicts.
*/
if (GinGetUseFastUpdate(scan->indexRelation))
PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
return; return;
} }
/*
* Pending list is not empty, we need to lock the index doesn't despite on
* fastupdate state
*/
PredicateLockRelation(scan->indexRelation, scan->xs_snapshot);
pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
LockBuffer(pos.pendingBuffer, GIN_SHARE); LockBuffer(pos.pendingBuffer, GIN_SHARE);
pos.firstOffset = FirstOffsetNumber; pos.firstOffset = FirstOffsetNumber;
......
...@@ -219,7 +219,7 @@ ginEntryInsert(GinState *ginstate, ...@@ -219,7 +219,7 @@ ginEntryInsert(GinState *ginstate,
return; return;
} }
GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer); CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* modify an existing leaf entry */ /* modify an existing leaf entry */
itup = addItemPointersToLeafTuple(ginstate, itup, itup = addItemPointersToLeafTuple(ginstate, itup,
items, nitem, buildStats, stack->buffer); items, nitem, buildStats, stack->buffer);
...@@ -228,7 +228,7 @@ ginEntryInsert(GinState *ginstate, ...@@ -228,7 +228,7 @@ ginEntryInsert(GinState *ginstate,
} }
else else
{ {
GinCheckForSerializableConflictIn(btree.index, NULL, stack->buffer); CheckForSerializableConflictIn(ginstate->index, NULL, stack->buffer);
/* no match, so construct a new leaf entry */ /* no match, so construct a new leaf entry */
itup = buildFreshLeafTuple(ginstate, attnum, key, category, itup = buildFreshLeafTuple(ginstate, attnum, key, category,
items, nitem, buildStats, stack->buffer); items, nitem, buildStats, stack->buffer);
...@@ -517,18 +517,6 @@ gininsert(Relation index, Datum *values, bool *isnull, ...@@ -517,18 +517,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
memset(&collector, 0, sizeof(GinTupleCollector)); memset(&collector, 0, sizeof(GinTupleCollector));
/*
* With fastupdate on each scan and each insert begin with access to
* pending list, so it effectively lock entire index. In this case we
* aquire predicate lock and check for conflicts over index relation,
* and hope that it will reduce locking overhead.
*
* Do not use GinCheckForSerializableConflictIn() here, because it
* will do nothing (it does actual work only with fastupdate off).
* Check for conflicts for entire index.
*/
CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
for (i = 0; i < ginstate->origTupdesc->natts; i++) for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleFastCollect(ginstate, &collector, ginHeapTupleFastCollect(ginstate, &collector,
(OffsetNumber) (i + 1), (OffsetNumber) (i + 1),
...@@ -539,16 +527,6 @@ gininsert(Relation index, Datum *values, bool *isnull, ...@@ -539,16 +527,6 @@ gininsert(Relation index, Datum *values, bool *isnull,
} }
else else
{ {
GinStatsData stats;
/*
* Fastupdate is off but if pending list isn't empty then we need to
* check conflicts with PredicateLockRelation in scanPendingInsert().
*/
ginGetStats(index, &stats);
if (stats.nPendingPages > 0)
CheckForSerializableConflictIn(index, NULL, InvalidBuffer);
for (i = 0; i < ginstate->origTupdesc->natts; i++) for (i = 0; i < ginstate->origTupdesc->natts; i++)
ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1), ginHeapTupleInsert(ginstate, (OffsetNumber) (i + 1),
values[i], isnull[i], values[i], isnull[i],
......
...@@ -718,10 +718,3 @@ ginUpdateStats(Relation index, const GinStatsData *stats) ...@@ -718,10 +718,3 @@ ginUpdateStats(Relation index, const GinStatsData *stats)
END_CRIT_SECTION(); END_CRIT_SECTION();
} }
void
GinCheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
{
if (!GinGetUseFastUpdate(relation))
CheckForSerializableConflictIn(relation, tuple, buffer);
}
...@@ -166,7 +166,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn ...@@ -166,7 +166,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
START_CRIT_SECTION(); START_CRIT_SECTION();
/* Unlink the page by changing left sibling's rightlink */ /* Unlink the page by changing left sibling's rightlink */
page = BufferGetPage(lBuffer); page = BufferGetPage(lBuffer);
GinPageGetOpaque(page)->rightlink = rightlink; GinPageGetOpaque(page)->rightlink = rightlink;
......
...@@ -1220,7 +1220,7 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, ...@@ -1220,7 +1220,7 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool is_split; bool is_split;
/* /*
* Check for any rw conflicts (in serialisation isolation level) just * Check for any rw conflicts (in serializable isolation level) just
* before we intend to modify the page * before we intend to modify the page
*/ */
CheckForSerializableConflictIn(state->r, NULL, stack->buffer); CheckForSerializableConflictIn(state->r, NULL, stack->buffer);
......
...@@ -373,21 +373,22 @@ index *leaf* pages needed to lock the appropriate index range. If, ...@@ -373,21 +373,22 @@ index *leaf* pages needed to lock the appropriate index range. If,
however, a search discovers that no root page has yet been created, a however, a search discovers that no root page has yet been created, a
predicate lock on the index relation is required. predicate lock on the index relation is required.
* Like a B-tree, GIN searches acquire predicate locks only on the
leaf pages of the entry tree. When performing an equality scan, if an
entry has a posting tree, the posting tree root is locked instead, to
lock only that key value. However, fastupdate=on postpones the
insertion of tuples into the index structure by temporarily storing them
in the pending list. That makes us unable to detect r-w conflicts using
page-level locks. To cope with that, insertions to the pending list
conflict with all scans.
* GiST searches can determine that there are no matches at any * GiST searches can determine that there are no matches at any
level of the index, so we acquire predicate lock at each index level of the index, so we acquire predicate lock at each index
level during a GiST search. An index insert at the leaf level can level during a GiST search. An index insert at the leaf level can
then be trusted to ripple up to all levels and locations where then be trusted to ripple up to all levels and locations where
conflicting predicate locks may exist. In case there is a page split, conflicting predicate locks may exist. In case there is a page split,
we need to copy predicate lock from an original page to all new pages. we need to copy predicate lock from the original page to all the new
pages.
* GIN searches acquire predicate locks only on the leaf pages
of entry tree and posting tree. During a page split, a predicate locks are
copied from the original page to the new page. In the same way predicate locks
are copied from entry tree leaf page to freshly created posting tree root.
However, when fast update is enabled, a predicate lock on the whole index
relation is required. Fast update postpones the insertion of tuples into index
structure by temporarily storing them into pending list. That makes us unable
to detect r-w conflicts using page-level locks.
* Hash index searches acquire predicate locks on the primary * Hash index searches acquire predicate locks on the primary
page of a bucket. It acquires a lock on both the old and new buckets page of a bucket. It acquires a lock on both the old and new buckets
...@@ -395,7 +396,6 @@ for scans that happen concurrently with page splits. During a bucket ...@@ -395,7 +396,6 @@ for scans that happen concurrently with page splits. During a bucket
split, a predicate lock is copied from the primary page of an old split, a predicate lock is copied from the primary page of an old
bucket to the primary page of a new bucket. bucket to the primary page of a new bucket.
* The effects of page splits, overflows, consolidations, and * The effects of page splits, overflows, consolidations, and
removals must be carefully reviewed to ensure that predicate locks removals must be carefully reviewed to ensure that predicate locks
aren't "lost" during those operations, or kept with pages which could aren't "lost" during those operations, or kept with pages which could
......
...@@ -103,8 +103,6 @@ extern Datum *ginExtractEntries(GinState *ginstate, OffsetNumber attnum, ...@@ -103,8 +103,6 @@ extern Datum *ginExtractEntries(GinState *ginstate, OffsetNumber attnum,
extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple); extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple, extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
GinNullCategory *category); GinNullCategory *category);
extern void GinCheckForSerializableConflictIn(Relation relation,
HeapTuple tuple, Buffer buffer);
/* gininsert.c */ /* gininsert.c */
extern IndexBuildResult *ginbuild(Relation heap, Relation index, extern IndexBuildResult *ginbuild(Relation heap, Relation index,
...@@ -227,7 +225,6 @@ extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ...@@ -227,7 +225,6 @@ extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
GinStatsData *buildStats); GinStatsData *buildStats);
extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot); extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot);
extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
/* /*
* This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
......
Parsed test spec with 3 sessions
starting permutation: r1 r2 w1 c1 w2 c2
step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
count
2
step r2: SELECT * FROM other_tbl;
id
step w1: INSERT INTO other_tbl VALUES (42);
step c1: COMMIT;
step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
ERROR: could not serialize access due to read/write dependencies among transactions
step c2: COMMIT;
starting permutation: r1 r2 w1 c1 fastupdate_on w2 c2
step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[1000];
count
2
step r2: SELECT * FROM other_tbl;
id
step w1: INSERT INTO other_tbl VALUES (42);
step c1: COMMIT;
step fastupdate_on: ALTER INDEX ginidx SET (fastupdate = on);
step w2: INSERT INTO gin_tbl SELECT array[1000,19001];
ERROR: could not serialize access due to read/write dependencies among transactions
step c2: COMMIT;
Parsed test spec with 2 sessions
starting permutation: r1 r2 w1 c1 w2 c2
step r1: SELECT count(*) FROM gin_tbl WHERE p @> array[-1];
count
0
step r2: SELECT * FROM other_tbl;
id
step w1: INSERT INTO other_tbl VALUES (42);
step c1: COMMIT;
step w2: INSERT INTO gin_tbl SELECT array[-1];
ERROR: could not serialize access due to read/write dependencies among transactions
step c2: COMMIT;
...@@ -737,8 +737,8 @@ step c2: commit; ...@@ -737,8 +737,8 @@ step c2: commit;
starting permutation: fu1 rxy1 rxy2fu wx1 c1 wy2fu c2 starting permutation: fu1 rxy1 rxy2fu wx1 c1 wy2fu c2
step fu1: alter index ginidx set (fastupdate = on); step fu1: alter index ginidx set (fastupdate = on);
commit; commit;
begin isolation level serializable; begin isolation level serializable;
set enable_seqscan=off; set enable_seqscan=off;
step rxy1: select count(*) from gin_tbl where p @> array[4,5]; step rxy1: select count(*) from gin_tbl where p @> array[4,5];
count count
......
...@@ -69,6 +69,8 @@ test: vacuum-concurrent-drop ...@@ -69,6 +69,8 @@ test: vacuum-concurrent-drop
test: predicate-hash test: predicate-hash
test: predicate-gist test: predicate-gist
test: predicate-gin test: predicate-gin
test: predicate-gin-fastupdate
test: predicate-gin-nomatch
test: partition-key-update-1 test: partition-key-update-1
test: partition-key-update-2 test: partition-key-update-2
test: partition-key-update-3 test: partition-key-update-3
#
# Test that predicate locking on a GIN index works correctly, even if
# fastupdate is turned on concurrently.
#
# 0. fastupdate is off
# 1. Session 's1' acquires predicate lock on page X
# 2. fastupdate is turned on
# 3. Session 's2' inserts a new tuple to the pending list
#
# This tests that if the insertion in step 3 conflicts with the predicate
# lock acquired in step 1, we detect that conflict correctly, even if
# fastupdate was turned on in-between.
#
setup
{
create table gin_tbl(p int4[]);
insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
create table other_tbl (id int4);
}
teardown
{
drop table gin_tbl;
drop table other_tbl;
}
session "s1"
setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[1000]; }
step "w1" { INSERT INTO other_tbl VALUES (42); }
step "c1" { COMMIT; }
session "s2"
setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
step "r2" { SELECT * FROM other_tbl; }
step "w2" { INSERT INTO gin_tbl SELECT array[1000,19001]; }
step "c2" { COMMIT; }
session "s3"
step "fastupdate_on" { ALTER INDEX ginidx SET (fastupdate = on); }
# This correctly throws serialization failure.
permutation "r1" "r2" "w1" "c1" "w2" "c2"
# If fastupdate is turned on in the middle, the conflict must still be detected.
permutation "r1" "r2" "w1" "c1" "fastupdate_on" "w2" "c2"
#
# Check that GIN index grabs an appropriate lock, even if there is no match.
#
setup
{
create table gin_tbl(p int4[]);
insert into gin_tbl select array[g, g*2,g*3] from generate_series(1, 10000) g;
insert into gin_tbl select array[4,5,6] from generate_series(10001, 20000) g;
create index ginidx on gin_tbl using gin(p) with (fastupdate = off);
create table other_tbl (id int4);
}
teardown
{
drop table gin_tbl;
drop table other_tbl;
}
session "s1"
setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
# Scan with no match.
step "r1" { SELECT count(*) FROM gin_tbl WHERE p @> array[-1]; }
step "w1" { INSERT INTO other_tbl VALUES (42); }
step "c1" { COMMIT; }
session "s2"
setup { BEGIN ISOLATION LEVEL SERIALIZABLE; SET enable_seqscan=off; }
step "r2" { SELECT * FROM other_tbl; }
# Insert row that would've matched in step "r1"
step "w2" { INSERT INTO gin_tbl SELECT array[-1]; }
step "c2" { COMMIT; }
# This should throw serialization failure.
permutation "r1" "r2" "w1" "c1" "w2" "c2"
...@@ -32,8 +32,8 @@ setup ...@@ -32,8 +32,8 @@ setup
# enable pending list for a small subset of tests # enable pending list for a small subset of tests
step "fu1" { alter index ginidx set (fastupdate = on); step "fu1" { alter index ginidx set (fastupdate = on);
commit; commit;
begin isolation level serializable; begin isolation level serializable;
set enable_seqscan=off; } set enable_seqscan=off; }
step "rxy1" { select count(*) from gin_tbl where p @> array[4,5]; } step "rxy1" { select count(*) from gin_tbl where p @> array[4,5]; }
step "wx1" { insert into gin_tbl select g, array[5,6] from generate_series step "wx1" { insert into gin_tbl select g, array[5,6] from generate_series
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment