Commit 4fafc4ec authored by Heikki Linnakangas

Cleanup of new b-tree page deletion code.

When marking a branch as half-dead, a pointer to the top of the branch is
stored in the leaf block's high key. During normal operation, the high key
was left in place, and the block number was just stored in the ctid field
of the high key tuple, but in WAL replay, the high key was recreated as a
truncated tuple with zero columns. For the sake of easier debugging, also
truncate the tuple in normal operation, so that the page is identical
after WAL replay. Also, rename the 'downlink' field in the WAL record to
'topparent', as that seems like a more descriptive name. And make sure
it's set to invalid when unlinking the leaf page.
parent d26b042c
...@@ -1303,6 +1303,10 @@ _bt_pagedel(Relation rel, Buffer buf) ...@@ -1303,6 +1303,10 @@ _bt_pagedel(Relation rel, Buffer buf)
return ndeleted; return ndeleted;
} }
/*
* First stage of page deletion. Remove the downlink to the top of the
* branch being deleted, and mark the leaf page as half-dead.
*/
static bool static bool
_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
{ {
...@@ -1317,6 +1321,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) ...@@ -1317,6 +1321,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
OffsetNumber topoff; OffsetNumber topoff;
OffsetNumber nextoffset; OffsetNumber nextoffset;
IndexTuple itup; IndexTuple itup;
IndexTupleData trunctuple;
page = BufferGetPage(leafbuf); page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
...@@ -1406,12 +1411,17 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) ...@@ -1406,12 +1411,17 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_flags |= BTP_HALF_DEAD; opaque->btpo_flags |= BTP_HALF_DEAD;
itemid = PageGetItemId(page, P_HIKEY); PageIndexTupleDelete(page, P_HIKEY);
itup = (IndexTuple) PageGetItem(page, itemid); Assert(PageGetMaxOffsetNumber(page) == 0);
if (target == leafblkno) MemSet(&trunctuple, 0, sizeof(IndexTupleData));
ItemPointerSetInvalid(&(itup->t_tid)); trunctuple.t_info = sizeof(IndexTupleData);
if (target != leafblkno)
ItemPointerSet(&trunctuple.t_tid, target, P_HIKEY);
else else
ItemPointerSet(&(itup->t_tid), target, P_HIKEY); ItemPointerSetInvalid(&trunctuple.t_tid);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
elog(ERROR, "could not add dummy high key to half-dead page");
/* Must mark buffers dirty before XLogInsert */ /* Must mark buffers dirty before XLogInsert */
MarkBufferDirty(topparent); MarkBufferDirty(topparent);
...@@ -1427,7 +1437,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) ...@@ -1427,7 +1437,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
xlrec.target.node = rel->rd_node; xlrec.target.node = rel->rd_node;
ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(topparent), topoff); ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(topparent), topoff);
xlrec.leafblk = leafblkno; xlrec.leafblk = leafblkno;
xlrec.downlink = target; if (target != leafblkno)
xlrec.topparent = target;
else
xlrec.topparent = InvalidBlockNumber;
page = BufferGetPage(leafbuf); page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
...@@ -1768,7 +1781,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) ...@@ -1768,7 +1781,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlrec.leafblk = leafblkno; xlrec.leafblk = leafblkno;
xlrec.leafleftsib = leafleftsib; xlrec.leafleftsib = leafleftsib;
xlrec.leafrightsib = leafrightsib; xlrec.leafrightsib = leafrightsib;
xlrec.downlink = nextchild; xlrec.topparent = nextchild;
rdata[0].data = (char *) &xlrec; rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeUnlinkPage; rdata[0].len = SizeOfBtreeUnlinkPage;
......
...@@ -870,8 +870,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record) ...@@ -870,8 +870,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record)
*/ */
MemSet(&trunctuple, 0, sizeof(IndexTupleData)); MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData); trunctuple.t_info = sizeof(IndexTupleData);
if (xlrec->downlink != InvalidBlockNumber) if (xlrec->topparent != InvalidBlockNumber)
ItemPointerSet(&trunctuple.t_tid, xlrec->downlink, P_HIKEY); ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
else else
ItemPointerSetInvalid(&trunctuple.t_tid); ItemPointerSetInvalid(&trunctuple.t_tid);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
...@@ -1006,8 +1006,8 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record) ...@@ -1006,8 +1006,8 @@ btree_xlog_unlink_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
/* Add a dummy hikey item */ /* Add a dummy hikey item */
MemSet(&trunctuple, 0, sizeof(IndexTupleData)); MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData); trunctuple.t_info = sizeof(IndexTupleData);
if (xlrec->downlink != InvalidBlockNumber) if (xlrec->topparent != InvalidBlockNumber)
ItemPointerSet(&trunctuple.t_tid, xlrec->downlink, P_HIKEY); ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY);
else else
ItemPointerSetInvalid(&trunctuple.t_tid); ItemPointerSetInvalid(&trunctuple.t_tid);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
......
...@@ -130,8 +130,8 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -130,8 +130,8 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfoString(buf, "mark_page_halfdead: "); appendStringInfoString(buf, "mark_page_halfdead: ");
out_target(buf, &(xlrec->target)); out_target(buf, &(xlrec->target));
appendStringInfo(buf, "; downlink %u; leaf %u; left %u; right %u", appendStringInfo(buf, "; topparent %u; leaf %u; left %u; right %u",
xlrec->downlink, xlrec->leafblk, xlrec->leftblk, xlrec->rightblk); xlrec->topparent, xlrec->leafblk, xlrec->leftblk, xlrec->rightblk);
break; break;
} }
case XLOG_BTREE_UNLINK_PAGE_META: case XLOG_BTREE_UNLINK_PAGE_META:
...@@ -143,8 +143,8 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -143,8 +143,8 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode); xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
appendStringInfo(buf, "dead %u; left %u; right %u; btpo_xact %u; ", appendStringInfo(buf, "dead %u; left %u; right %u; btpo_xact %u; ",
xlrec->deadblk, xlrec->leftsib, xlrec->rightsib, xlrec->btpo_xact); xlrec->deadblk, xlrec->leftsib, xlrec->rightsib, xlrec->btpo_xact);
appendStringInfo(buf, "leaf %u; leafleft %u; leafright %u; downlink %u", appendStringInfo(buf, "leaf %u; leafleft %u; leafright %u; topparent %u",
xlrec->leafblk, xlrec->leafleftsib, xlrec->leafrightsib, xlrec->downlink); xlrec->leafblk, xlrec->leafleftsib, xlrec->leafrightsib, xlrec->topparent);
break; break;
} }
case XLOG_BTREE_NEWROOT: case XLOG_BTREE_NEWROOT:
......
...@@ -379,13 +379,15 @@ typedef struct xl_btree_vacuum ...@@ -379,13 +379,15 @@ typedef struct xl_btree_vacuum
typedef struct xl_btree_mark_page_halfdead typedef struct xl_btree_mark_page_halfdead
{ {
xl_btreetid target; /* deleted tuple id in parent page */ xl_btreetid target; /* deleted tuple id in parent page */
/* information needed to recreate the leaf page: */
BlockNumber leafblk; /* leaf block ultimately being deleted */ BlockNumber leafblk; /* leaf block ultimately being deleted */
BlockNumber leftblk; /* leaf block's left sibling, if any */ BlockNumber leftblk; /* leaf block's left sibling, if any */
BlockNumber rightblk; /* leaf block's right sibling */ BlockNumber rightblk; /* leaf block's right sibling */
BlockNumber downlink; /* next child down in the branch */ BlockNumber topparent; /* topmost internal page in the branch */
} xl_btree_mark_page_halfdead; } xl_btree_mark_page_halfdead;
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, downlink) + sizeof(BlockNumber)) #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/* /*
* This is what we need to know about deletion of a btree page. Note we do * This is what we need to know about deletion of a btree page. Note we do
...@@ -406,7 +408,7 @@ typedef struct xl_btree_unlink_page ...@@ -406,7 +408,7 @@ typedef struct xl_btree_unlink_page
BlockNumber leafblk; BlockNumber leafblk;
BlockNumber leafleftsib; BlockNumber leafleftsib;
BlockNumber leafrightsib; BlockNumber leafrightsib;
BlockNumber downlink; /* next child down in the branch */ BlockNumber topparent; /* next child down in the branch */
TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment