Commit 52ac6cd2 authored by Alexander Korotkov's avatar Alexander Korotkov

Prevent GIN deleted pages from being reclaimed too early

When GIN vacuum deletes a posting tree page, it assumes that no concurrent
searchers can access it, thanks to ginStepRight() locking two pages at once.
However, since 9.4 searches can skip parts of posting trees descending from the
root.  That leads to the risk that page is deleted and reclaimed before
concurrent search can access it.

This commit prevents the risk of above by waiting for every transaction, which
might wait to reference this page, to finish.  Due to binary compatibility
we can't change GinPageOpaqueData to store corresponding transaction id.
Instead we reuse page header pd_prune_xid field, which is unused in index pages.

Discussion: https://postgr.es/m/31a702a.14dd.166c1366ac1.Coremail.chjischj%40163.com
Author: Andrey Borodin, Alexander Korotkov
Reviewed-by: Alexander Korotkov
Backpatch-through: 9.4
parent c6ade7a8
...@@ -304,12 +304,10 @@ the lock on next page has been acquired. ...@@ -304,12 +304,10 @@ the lock on next page has been acquired.
The downlink is more tricky. A search descending the tree must release the The downlink is more tricky. A search descending the tree must release the
lock on the parent page before locking the child, or it could deadlock with lock on the parent page before locking the child, or it could deadlock with
a concurrent split of the child page; a page split locks the parent, while a concurrent split of the child page; a page split locks the parent, while
already holding a lock on the child page. However, posting trees are only already holding a lock on the child page. So, deleted page cannot be reclaimed
fully searched from left to right, starting from the leftmost leaf. (The immediately. Instead, we have to wait for every transaction, which might wait
tree-structure is only needed by insertions, to quickly find the correct to reference this page, to finish. Corresponding processes must observe that
insert location). So as long as we don't delete the leftmost page on each the page is marked deleted and recover accordingly.
level, a search can never follow a downlink to page that's about to be
deleted.
The previous paragraph's reasoning only applies to searches, and only to The previous paragraph's reasoning only applies to searches, and only to
posting trees. To protect from inserters following a downlink to a deleted posting trees. To protect from inserters following a downlink to a deleted
......
...@@ -309,12 +309,7 @@ GinNewBuffer(Relation index) ...@@ -309,12 +309,7 @@ GinNewBuffer(Relation index)
*/ */
if (ConditionalLockBuffer(buffer)) if (ConditionalLockBuffer(buffer))
{ {
Page page = BufferGetPage(buffer); if (GinPageIsRecyclable(BufferGetPage(buffer)))
if (PageIsNew(page))
return buffer; /* OK to use, if never initialized */
if (GinPageIsDeleted(page))
return buffer; /* OK to use */ return buffer; /* OK to use */
LockBuffer(buffer, GIN_UNLOCK); LockBuffer(buffer, GIN_UNLOCK);
......
...@@ -157,6 +157,9 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn ...@@ -157,6 +157,9 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
page = BufferGetPage(dBuffer); page = BufferGetPage(dBuffer);
rightlink = GinPageGetOpaque(page)->rightlink; rightlink = GinPageGetOpaque(page)->rightlink;
/* For deleted page remember last xid which could knew its address */
GinPageSetDeleteXid(page, ReadNewTransactionId());
/* /*
* Any insert which would have gone on the leaf block will now go to its * Any insert which would have gone on the leaf block will now go to its
* right sibling. * right sibling.
...@@ -213,6 +216,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn ...@@ -213,6 +216,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
data.parentOffset = myoff; data.parentOffset = myoff;
data.rightLink = GinPageGetOpaque(page)->rightlink; data.rightLink = GinPageGetOpaque(page)->rightlink;
data.deleteXid = GinPageGetDeleteXid(page);
XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage));
...@@ -732,7 +736,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) ...@@ -732,7 +736,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
LockBuffer(buffer, GIN_SHARE); LockBuffer(buffer, GIN_SHARE);
page = (Page) BufferGetPage(buffer); page = (Page) BufferGetPage(buffer);
if (PageIsNew(page) || GinPageIsDeleted(page)) if (GinPageIsRecyclable(page))
{ {
Assert(blkno != GIN_ROOT_BLKNO); Assert(blkno != GIN_ROOT_BLKNO);
RecordFreeIndexPage(index, blkno); RecordFreeIndexPage(index, blkno);
......
...@@ -531,6 +531,7 @@ ginRedoDeletePage(XLogReaderState *record) ...@@ -531,6 +531,7 @@ ginRedoDeletePage(XLogReaderState *record)
page = BufferGetPage(dbuffer); page = BufferGetPage(dbuffer);
Assert(GinPageIsData(page)); Assert(GinPageIsData(page));
GinPageGetOpaque(page)->flags = GIN_DELETED; GinPageGetOpaque(page)->flags = GIN_DELETED;
GinPageSetDeleteXid(page, data->deleteXid);
PageSetLSN(page, lsn); PageSetLSN(page, lsn);
MarkBufferDirty(dbuffer); MarkBufferDirty(dbuffer);
} }
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#ifndef GINBLOCK_H #ifndef GINBLOCK_H
#define GINBLOCK_H #define GINBLOCK_H
#include "access/transam.h"
#include "storage/block.h" #include "storage/block.h"
#include "storage/itemptr.h" #include "storage/itemptr.h"
#include "storage/off.h" #include "storage/off.h"
...@@ -127,6 +128,15 @@ typedef struct GinMetaPageData ...@@ -127,6 +128,15 @@ typedef struct GinMetaPageData
#define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) #define GinPageRightMost(page) ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber)
/*
* We should reclaim deleted page only once every transaction started before
* its deletion is over.
*/
#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid)
#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \
&& TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin)))
/* /*
* We use our own ItemPointerGet(BlockNumber|OffsetNumber) * We use our own ItemPointerGet(BlockNumber|OffsetNumber)
* to avoid Asserts, since sometimes the ip_posid isn't "valid" * to avoid Asserts, since sometimes the ip_posid isn't "valid"
......
...@@ -158,6 +158,7 @@ typedef struct ginxlogDeletePage ...@@ -158,6 +158,7 @@ typedef struct ginxlogDeletePage
{ {
OffsetNumber parentOffset; OffsetNumber parentOffset;
BlockNumber rightLink; BlockNumber rightLink;
TransactionId deleteXid; /* last Xid which could see this page in scan */
} ginxlogDeletePage; } ginxlogDeletePage;
#define XLOG_GIN_UPDATE_META_PAGE 0x60 #define XLOG_GIN_UPDATE_META_PAGE 0x60
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment