Commit 93190c30 authored by Tom Lane's avatar Tom Lane

Repair still another bug in the btree page split WAL reduction patch:

it failed for splits of non-leaf pages because in such pages the first
data key on a page is suppressed, and so we can't just copy the first
key from the right page to reconstitute the left page's high key.
Problem found by Koichi Suzuki, patch by Heikki.
parent bb4a78c0
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.161 2007/11/15 21:14:32 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.162 2007/11/16 19:53:50 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -371,13 +371,13 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, ...@@ -371,13 +371,13 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* removing any LP_DEAD tuples. * removing any LP_DEAD tuples.
* *
* On entry, *buf and *offsetptr point to the first legal position * On entry, *buf and *offsetptr point to the first legal position
* where the new tuple could be inserted. The caller should hold an * where the new tuple could be inserted. The caller should hold an
* exclusive lock on *buf. *offsetptr can also be set to * exclusive lock on *buf. *offsetptr can also be set to
* InvalidOffsetNumber, in which case the function will search the right * InvalidOffsetNumber, in which case the function will search for the
* location within the page if needed. On exit, they point to the chosen * right location within the page if needed. On exit, they point to the
* insert location. If findinsertloc decided to move right, the lock and * chosen insert location. If _bt_findinsertloc decides to move right,
* pin on the original page will be released and the new page returned to * the lock and pin on the original page will be released and the new
* the caller is exclusively locked instead. * page returned to the caller is exclusively locked instead.
* *
* newtup is the new tuple we're inserting, and scankey is an insertion * newtup is the new tuple we're inserting, and scankey is an insertion
* type scan key for it. * type scan key for it.
...@@ -422,8 +422,6 @@ _bt_findinsertloc(Relation rel, ...@@ -422,8 +422,6 @@ _bt_findinsertloc(Relation rel,
"Consider a function index of an MD5 hash of the value, " "Consider a function index of an MD5 hash of the value, "
"or use full text indexing."))); "or use full text indexing.")));
/*---------- /*----------
* If we will need to split the page to put the item on this page, * If we will need to split the page to put the item on this page,
* check whether we can put the tuple somewhere to the right, * check whether we can put the tuple somewhere to the right,
...@@ -1004,7 +1002,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1004,7 +1002,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
xl_btree_split xlrec; xl_btree_split xlrec;
uint8 xlinfo; uint8 xlinfo;
XLogRecPtr recptr; XLogRecPtr recptr;
XLogRecData rdata[6]; XLogRecData rdata[7];
XLogRecData *lastrdata; XLogRecData *lastrdata;
xlrec.node = rel->rd_node; xlrec.node = rel->rd_node;
...@@ -1020,15 +1018,32 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1020,15 +1018,32 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata = &rdata[0]; lastrdata = &rdata[0];
/* Log downlink on non-leaf pages. */
if (ropaque->btpo.level > 0) if (ropaque->btpo.level > 0)
{ {
/* Log downlink on non-leaf pages */
lastrdata->next = lastrdata + 1; lastrdata->next = lastrdata + 1;
lastrdata++; lastrdata++;
lastrdata->data = (char *) &newitem->t_tid.ip_blkid; lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
lastrdata->len = sizeof(BlockIdData); lastrdata->len = sizeof(BlockIdData);
lastrdata->buffer = InvalidBuffer; lastrdata->buffer = InvalidBuffer;
/*
* We must also log the left page's high key, because the right
* page's leftmost key is suppressed on non-leaf levels. Show it
* as belonging to the left page buffer, so that it is not stored
* if XLogInsert decides it needs a full-page image of the left
* page.
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
itemid = PageGetItemId(origpage, P_HIKEY);
item = (IndexTuple) PageGetItem(origpage, itemid);
lastrdata->data = (char *) item;
lastrdata->len = MAXALIGN(IndexTupleSize(item));
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
} }
/* /*
...@@ -1057,7 +1072,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1057,7 +1072,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true; lastrdata->buffer_std = true;
} }
else else if (ropaque->btpo.level == 0)
{ {
/* /*
* Although we don't need to WAL-log the new item, we still need * Although we don't need to WAL-log the new item, we still need
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.48 2007/11/15 22:25:15 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.49 2007/11/16 19:53:50 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -273,6 +273,8 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -273,6 +273,8 @@ btree_xlog_split(bool onleft, bool isroot,
OffsetNumber newitemoff = 0; OffsetNumber newitemoff = 0;
Item newitem = NULL; Item newitem = NULL;
Size newitemsz = 0; Size newitemsz = 0;
Item left_hikey = NULL;
Size left_hikeysz = 0;
reln = XLogOpenRelation(xlrec->node); reln = XLogOpenRelation(xlrec->node);
...@@ -289,6 +291,17 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -289,6 +291,17 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= sizeof(BlockIdData); datalen -= sizeof(BlockIdData);
forget_matching_split(xlrec->node, downlink, false); forget_matching_split(xlrec->node, downlink, false);
/* Extract left hikey and its size (still assuming 16-bit alignment) */
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
/* We assume 16-bit alignment is enough for IndexTupleSize */
left_hikey = (Item) datapos;
left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
datapos += left_hikeysz;
datalen -= left_hikeysz;
}
} }
/* Extract newitem and newitemoff, if present */ /* Extract newitem and newitemoff, if present */
...@@ -302,17 +315,13 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -302,17 +315,13 @@ btree_xlog_split(bool onleft, bool isroot,
if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1)) if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
{ {
IndexTupleData itupdata;
/* /*
* We need to copy the tuple header to apply IndexTupleDSize, because * We assume that 16-bit alignment is enough to apply IndexTupleSize
* of alignment considerations. However, we assume that PageAddItem * (since it's fetching from a uint16 field) and also enough for
* doesn't care about the alignment of the newitem pointer it's given. * PageAddItem to insert the tuple.
*/ */
newitem = datapos; newitem = (Item) datapos;
memcpy(&itupdata, datapos, sizeof(IndexTupleData)); newitemsz = MAXALIGN(IndexTupleSize(newitem));
newitemsz = IndexTupleDSize(itupdata);
newitemsz = MAXALIGN(newitemsz);
datapos += newitemsz; datapos += newitemsz;
datalen -= newitemsz; datalen -= newitemsz;
} }
...@@ -333,6 +342,18 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -333,6 +342,18 @@ btree_xlog_split(bool onleft, bool isroot,
_bt_restore_page(rpage, datapos, datalen); _bt_restore_page(rpage, datapos, datalen);
/*
* On leaf level, the high key of the left page is equal to the
* first key on the right page.
*/
if (xlrec->level == 0)
{
ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
left_hikey = PageGetItem(rpage, hiItemId);
left_hikeysz = ItemIdGetLength(hiItemId);
}
PageSetLSN(rpage, lsn); PageSetLSN(rpage, lsn);
PageSetTLI(rpage, ThisTimeLineID); PageSetTLI(rpage, ThisTimeLineID);
MarkBufferDirty(rbuf); MarkBufferDirty(rbuf);
...@@ -360,8 +381,6 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -360,8 +381,6 @@ btree_xlog_split(bool onleft, bool isroot,
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage); OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
OffsetNumber deletable[MaxOffsetNumber]; OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0; int ndeletable = 0;
ItemId hiItemId;
Item hiItem;
/* /*
* Remove the items from the left page that were copied to the * Remove the items from the left page that were copied to the
...@@ -394,11 +413,8 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -394,11 +413,8 @@ btree_xlog_split(bool onleft, bool isroot,
elog(PANIC, "failed to add new item to left page after split"); elog(PANIC, "failed to add new item to left page after split");
} }
/* Set high key equal to the first key on the right page */ /* Set high key */
hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); if (PageAddItem(lpage, left_hikey, left_hikeysz,
hiItem = PageGetItem(rpage, hiItemId);
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
P_HIKEY, false, false) == InvalidOffsetNumber) P_HIKEY, false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add high key to left page after split"); elog(PANIC, "failed to add high key to left page after split");
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.114 2007/11/15 21:14:42 momjian Exp $ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.115 2007/11/16 19:53:50 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -289,8 +289,15 @@ typedef struct xl_btree_split ...@@ -289,8 +289,15 @@ typedef struct xl_btree_split
* than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
* aligned.) * aligned.)
* *
* If level > 0, an IndexTuple representing the HIKEY of the left page
* follows. We don't need this on leaf pages, because it's the same
* as the leftmost key in the new right page. Also, it's suppressed if
* XLogInsert chooses to store the left page's whole page image.
*
* In the _L variants, next are OffsetNumber newitemoff and the new item. * In the _L variants, next are OffsetNumber newitemoff and the new item.
* (In the _R variants, the new item is one of the right page's tuples.) * (In the _R variants, the new item is one of the right page's tuples.)
* The new item, but not newitemoff, is suppressed if XLogInsert chooses
* to store the left page's whole page image.
* *
* Last are the right page's tuples in the form used by _bt_restore_page. * Last are the right page's tuples in the form used by _bt_restore_page.
*/ */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment