Commit 226a1005 authored by Tom Lane

Code review for btree page split WAL reduction patch. Make it actually work

(original code *always* created a full-page image for the left page, thus
leaving the intended savings unrealized), avoid risk of not having enough room
on the page during xlog restore, squeeze out another couple bytes in the xlog
record, clean up neglected comments.
parent f7424b0d
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.155 2007/03/25 19:45:14 tgl Exp $ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.156 2007/04/11 20:47:37 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -779,8 +779,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -779,8 +779,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
Buffer sbuf = InvalidBuffer; Buffer sbuf = InvalidBuffer;
Page spage = NULL; Page spage = NULL;
BTPageOpaque sopaque = NULL; BTPageOpaque sopaque = NULL;
OffsetNumber itup_off = 0;
BlockNumber itup_blkno = 0;
Size itemsz; Size itemsz;
ItemId itemid; ItemId itemid;
IndexTuple item; IndexTuple item;
...@@ -798,6 +796,14 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -798,6 +796,14 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
_bt_pageinit(leftpage, BufferGetPageSize(buf)); _bt_pageinit(leftpage, BufferGetPageSize(buf));
/* rightpage was already initialized by _bt_getbuf */ /* rightpage was already initialized by _bt_getbuf */
/*
* Copy the original page's LSN and TLI into leftpage, which will become
* the updated version of the page. We need this because XLogInsert will
* examine these fields and possibly dump them in a page image.
*/
PageSetLSN(leftpage, PageGetLSN(origpage));
PageSetTLI(leftpage, PageGetTLI(origpage));
/* init btree private data */ /* init btree private data */
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
...@@ -864,7 +870,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -864,7 +870,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
leftoff = OffsetNumberNext(leftoff); leftoff = OffsetNumberNext(leftoff);
/* /*
* Now transfer all the data items to the appropriate page * Now transfer all the data items to the appropriate page.
*
* Note: we *must* insert at least the right page's items in item-number
* order, for the benefit of _bt_restore_page().
*/ */
maxoff = PageGetMaxOffsetNumber(origpage); maxoff = PageGetMaxOffsetNumber(origpage);
...@@ -881,16 +890,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -881,16 +890,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
{ {
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff, _bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
"left sibling"); "left sibling");
itup_off = leftoff;
itup_blkno = BufferGetBlockNumber(buf);
leftoff = OffsetNumberNext(leftoff); leftoff = OffsetNumberNext(leftoff);
} }
else else
{ {
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling"); "right sibling");
itup_off = rightoff;
itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff); rightoff = OffsetNumberNext(rightoff);
} }
} }
...@@ -921,8 +926,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -921,8 +926,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
Assert(!newitemonleft); Assert(!newitemonleft);
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff, _bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
"right sibling"); "right sibling");
itup_off = rightoff;
itup_blkno = BufferGetBlockNumber(rbuf);
rightoff = OffsetNumberNext(rightoff); rightoff = OffsetNumberNext(rightoff);
} }
...@@ -961,7 +964,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -961,7 +964,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
/* /*
* Right sibling is locked, new siblings are prepared, but original page * Right sibling is locked, new siblings are prepared, but original page
* is not updated yet. Log changes before continuing. * is not updated yet.
* *
* NO EREPORT(ERROR) till right sibling is updated. We can get away with * NO EREPORT(ERROR) till right sibling is updated. We can get away with
* not starting the critical section till here because we haven't been * not starting the critical section till here because we haven't been
...@@ -970,15 +973,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -970,15 +973,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
*/ */
START_CRIT_SECTION(); START_CRIT_SECTION();
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);
if (!P_RIGHTMOST(ropaque))
{
sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
MarkBufferDirty(sbuf);
}
/* /*
* By here, the original data page has been split into two new halves, and * By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never * these are correct. The algorithm requires that the left page never
...@@ -994,6 +988,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -994,6 +988,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
*/ */
PageRestoreTempPage(leftpage, origpage); PageRestoreTempPage(leftpage, origpage);
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);
if (!P_RIGHTMOST(ropaque))
{
sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
MarkBufferDirty(sbuf);
}
/* XLOG stuff */ /* XLOG stuff */
if (!rel->rd_istemp) if (!rel->rd_istemp)
{ {
...@@ -1006,9 +1009,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1006,9 +1009,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
xlrec.node = rel->rd_node; xlrec.node = rel->rd_node;
xlrec.leftsib = BufferGetBlockNumber(buf); xlrec.leftsib = BufferGetBlockNumber(buf);
xlrec.rightsib = BufferGetBlockNumber(rbuf); xlrec.rightsib = BufferGetBlockNumber(rbuf);
xlrec.firstright = firstright;
xlrec.rnext = ropaque->btpo_next; xlrec.rnext = ropaque->btpo_next;
xlrec.level = ropaque->btpo.level; xlrec.level = ropaque->btpo.level;
xlrec.firstright = firstright;
rdata[0].data = (char *) &xlrec; rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit; rdata[0].len = SizeOfBtreeSplit;
...@@ -1027,14 +1030,18 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1027,14 +1030,18 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->buffer = InvalidBuffer; lastrdata->buffer = InvalidBuffer;
} }
/* Log the new item, if it was inserted on the left page. If it was /*
* put on the right page, we don't need to explicitly WAL log it * Log the new item and its offset, if it was inserted on the left
* because it's included with all the other items on the right page. * page. (If it was put on the right page, we don't need to explicitly
* WAL log it because it's included with all the other items on the
* right page.) Show these as belonging to the left page buffer,
* so that they are not stored if XLogInsert decides it needs a
* full-page image of the left page.
*/ */
lastrdata->next = lastrdata + 1;
lastrdata++;
if (newitemonleft) if (newitemonleft)
{ {
lastrdata->next = lastrdata + 1;
lastrdata++;
lastrdata->data = (char *) &newitemoff; lastrdata->data = (char *) &newitemoff;
lastrdata->len = sizeof(OffsetNumber); lastrdata->len = sizeof(OffsetNumber);
lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer = buf; /* backup block 1 */
...@@ -1042,27 +1049,37 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1042,27 +1049,37 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lastrdata->next = lastrdata + 1; lastrdata->next = lastrdata + 1;
lastrdata++; lastrdata++;
lastrdata->data = (char *)newitem; lastrdata->data = (char *) newitem;
lastrdata->len = newitemsz; lastrdata->len = MAXALIGN(newitemsz);
lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true; lastrdata->buffer_std = true;
} }
else else
{ {
/*
* Although we don't need to WAL-log the new item, we still
* need XLogInsert to consider storing a full-page image of the
* left page, so make an empty entry referencing that buffer.
* This also ensures that the left page is always backup block 1.
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
lastrdata->data = NULL; lastrdata->data = NULL;
lastrdata->len = 0; lastrdata->len = 0;
lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true; lastrdata->buffer_std = true;
} }
/* Log the contents of the right page in the format understood by /*
* Log the contents of the right page in the format understood by
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
* because we're going to recreate the whole page anyway. * because we're going to recreate the whole page anyway, so it
* should never be stored by XLogInsert.
* *
* Direct access to page is not good but faster - we should implement * Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples * some new func in page API. Note we only store the tuples
* themselves, knowing that the item pointers are in the same order * themselves, knowing that they were inserted in item-number order
* and can be reconstructed by scanning the tuples. See comments for * and so the item pointers can be reconstructed. See comments for
* _bt_restore_page(). * _bt_restore_page().
*/ */
lastrdata->next = lastrdata + 1; lastrdata->next = lastrdata + 1;
...@@ -1074,7 +1091,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -1074,7 +1091,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
((PageHeader) rightpage)->pd_upper; ((PageHeader) rightpage)->pd_upper;
lastrdata->buffer = InvalidBuffer; lastrdata->buffer = InvalidBuffer;
/* Log the right sibling, because we've changed it's prev-pointer. */ /* Log the right sibling, because we've changed its prev-pointer. */
if (!P_RIGHTMOST(ropaque)) if (!P_RIGHTMOST(ropaque))
{ {
lastrdata->next = lastrdata + 1; lastrdata->next = lastrdata + 1;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.42 2007/02/08 05:05:53 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.43 2007/04/11 20:47:38 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -125,7 +125,8 @@ forget_matching_deletion(RelFileNode node, BlockNumber delblk) ...@@ -125,7 +125,8 @@ forget_matching_deletion(RelFileNode node, BlockNumber delblk)
* in correct itemno sequence, but physically the opposite order from the * in correct itemno sequence, but physically the opposite order from the
* original, because we insert them in the opposite of itemno order. This * original, because we insert them in the opposite of itemno order. This
* does not matter in any current btree code, but it's something to keep an * does not matter in any current btree code, but it's something to keep an
* eye on. Is it worth changing just on general principles? * eye on. Is it worth changing just on general principles? See also the
* notes in btree_xlog_split().
*/ */
static void static void
_bt_restore_page(Page page, char *from, int len) _bt_restore_page(Page page, char *from, int len)
...@@ -264,14 +265,12 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -264,14 +265,12 @@ btree_xlog_split(bool onleft, bool isroot,
{ {
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
Relation reln; Relation reln;
Buffer lbuf, rbuf; Buffer rbuf;
Page lpage, rpage; Page rpage;
BTPageOpaque ropaque, lopaque; BTPageOpaque ropaque;
char *datapos; char *datapos;
int datalen; int datalen;
bool bkp_left = record->xl_info & XLR_BKP_BLOCK_1; OffsetNumber newitemoff = 0;
bool bkp_nextsib = record->xl_info & XLR_BKP_BLOCK_2;
OffsetNumber newitemoff;
Item newitem = NULL; Item newitem = NULL;
Size newitemsz = 0; Size newitemsz = 0;
...@@ -283,6 +282,7 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -283,6 +282,7 @@ btree_xlog_split(bool onleft, bool isroot,
/* Forget any split this insertion completes */ /* Forget any split this insertion completes */
if (xlrec->level > 0) if (xlrec->level > 0)
{ {
/* we assume SizeOfBtreeSplit is at least 16-bit aligned */
BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos); BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);
datapos += sizeof(BlockIdData); datapos += sizeof(BlockIdData);
...@@ -291,19 +291,22 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -291,19 +291,22 @@ btree_xlog_split(bool onleft, bool isroot,
forget_matching_split(xlrec->node, downlink, false); forget_matching_split(xlrec->node, downlink, false);
} }
/* Extract newitem and newitemoff, if present */
/* Extract newitem and newitemoff */ if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
if (!bkp_left && onleft)
{ {
IndexTupleData itupdata; IndexTupleData itupdata;
/* Extract the offset of the new tuple and it's contents */ /* Extract the offset (still assuming 16-bit alignment) */
memcpy(&newitemoff, datapos, sizeof(OffsetNumber)); memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
datapos += sizeof(OffsetNumber); datapos += sizeof(OffsetNumber);
datalen -= sizeof(OffsetNumber); datalen -= sizeof(OffsetNumber);
/*
* We need to copy the tuple header to apply IndexTupleDSize, because
* of alignment considerations. However, we assume that PageAddItem
* doesn't care about the alignment of the newitem pointer it's given.
*/
newitem = datapos; newitem = datapos;
/* Need to copy tuple header due to alignment considerations */
memcpy(&itupdata, datapos, sizeof(IndexTupleData)); memcpy(&itupdata, datapos, sizeof(IndexTupleData));
newitemsz = IndexTupleDSize(itupdata); newitemsz = IndexTupleDSize(itupdata);
newitemsz = MAXALIGN(newitemsz); newitemsz = MAXALIGN(newitemsz);
...@@ -311,7 +314,7 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -311,7 +314,7 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= newitemsz; datalen -= newitemsz;
} }
/* Reconstruct right (new) sibling */ /* Reconstruct right (new) sibling from scratch */
rbuf = XLogReadBuffer(reln, xlrec->rightsib, true); rbuf = XLogReadBuffer(reln, xlrec->rightsib, true);
Assert(BufferIsValid(rbuf)); Assert(BufferIsValid(rbuf));
rpage = (Page) BufferGetPage(rbuf); rpage = (Page) BufferGetPage(rbuf);
...@@ -331,55 +334,69 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -331,55 +334,69 @@ btree_xlog_split(bool onleft, bool isroot,
PageSetTLI(rpage, ThisTimeLineID); PageSetTLI(rpage, ThisTimeLineID);
MarkBufferDirty(rbuf); MarkBufferDirty(rbuf);
/* don't release the buffer yet, because reconstructing the left sibling /* don't release the buffer yet; we touch right page's first item below */
* needs to access the data on the right page
*/
/* Reconstruct left (original) sibling */
if(!bkp_left) /*
* Reconstruct left (original) sibling if needed. Note that this code
* ensures that the items remaining on the left page are in the correct
* item number order, but it does not reproduce the physical order they
* would have had. Is this worth changing? See also _bt_restore_page().
*/
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{ {
lbuf = XLogReadBuffer(reln, xlrec->leftsib, false); Buffer lbuf = XLogReadBuffer(reln, xlrec->leftsib, false);
if (BufferIsValid(lbuf)) if (BufferIsValid(lbuf))
{ {
lpage = (Page) BufferGetPage(lbuf); Page lpage = (Page) BufferGetPage(lbuf);
lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
if (!XLByteLE(lsn, PageGetLSN(lpage))) if (!XLByteLE(lsn, PageGetLSN(lpage)))
{ {
/* Remove the items from the left page that were copied to
* right page, and add the new item if it was inserted to
* left page.
*/
OffsetNumber off; OffsetNumber off;
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage); OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
ItemId hiItemId; ItemId hiItemId;
Item hiItem; Item hiItem;
for(off = maxoff ; off >= xlrec->firstright; off--) /*
PageIndexTupleDelete(lpage, off); * Remove the items from the left page that were copied to
* the right page. Also remove the old high key, if any.
* (We must remove everything before trying to insert any
* items, else we risk not having enough space.)
*/
if (!P_RIGHTMOST(lopaque))
{
deletable[ndeletable++] = P_HIKEY;
/*
* newitemoff is given to us relative to the original
* page's item numbering, so adjust it for this deletion.
*/
newitemoff--;
}
for (off = xlrec->firstright; off <= maxoff; off++)
deletable[ndeletable++] = off;
if (ndeletable > 0)
PageIndexMultiDelete(lpage, deletable, ndeletable);
/*
* Add the new item if it was inserted on left page.
*/
if (onleft) if (onleft)
{ {
if (PageAddItem(lpage, newitem, newitemsz, newitemoff, if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
LP_USED) == InvalidOffsetNumber) LP_USED) == InvalidOffsetNumber)
elog(PANIC, "can't add new item to left sibling after split"); elog(PANIC, "failed to add new item to left page after split");
} }
/* Set high key equal to the first key on the right page */ /* Set high key equal to the first key on the right page */
hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
hiItem = PageGetItem(rpage, hiItemId); hiItem = PageGetItem(rpage, hiItemId);
if(!P_RIGHTMOST(lopaque)) if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
{
/* but remove the old high key first */
PageIndexTupleDelete(lpage, P_HIKEY);
}
if(PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
P_HIKEY, LP_USED) == InvalidOffsetNumber) P_HIKEY, LP_USED) == InvalidOffsetNumber)
elog(PANIC, "can't add high key after split to left page"); elog(PANIC, "failed to add high key to left page after split");
/* Fix opaque fields */ /* Fix opaque fields */
lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
...@@ -393,16 +410,16 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -393,16 +410,16 @@ btree_xlog_split(bool onleft, bool isroot,
UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(lbuf);
} }
} }
/* we no longer need the right buffer. */ /* We no longer need the right buffer */
UnlockReleaseBuffer(rbuf); UnlockReleaseBuffer(rbuf);
/* Fix left-link of the page to the right of the new right sibling */ /* Fix left-link of the page to the right of the new right sibling */
if (!bkp_nextsib && xlrec->rnext != P_NONE) if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
{ {
Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false); Buffer buffer = XLogReadBuffer(reln, xlrec->rnext, false);
if (BufferIsValid(buffer)) if (BufferIsValid(buffer))
{ {
Page page = (Page) BufferGetPage(buffer); Page page = (Page) BufferGetPage(buffer);
...@@ -410,6 +427,7 @@ btree_xlog_split(bool onleft, bool isroot, ...@@ -410,6 +427,7 @@ btree_xlog_split(bool onleft, bool isroot,
if (!XLByteLE(lsn, PageGetLSN(page))) if (!XLByteLE(lsn, PageGetLSN(page)))
{ {
BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
pageop->btpo_prev = xlrec->rightsib; pageop->btpo_prev = xlrec->rightsib;
PageSetLSN(page, lsn); PageSetLSN(page, lsn);
...@@ -773,9 +791,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -773,9 +791,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfo(buf, "split_l: rel %u/%u/%u ", appendStringInfo(buf, "split_l: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode); xlrec->node.relNode);
appendStringInfo(buf, "left %u, right %u off %u level %u", appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
xlrec->leftsib, xlrec->rightsib, xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
xlrec->firstright, xlrec->level); xlrec->level, xlrec->firstright);
break; break;
} }
case XLOG_BTREE_SPLIT_R: case XLOG_BTREE_SPLIT_R:
...@@ -785,9 +803,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -785,9 +803,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfo(buf, "split_r: rel %u/%u/%u ", appendStringInfo(buf, "split_r: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode); xlrec->node.relNode);
appendStringInfo(buf, "left %u, right %u off %u level %u", appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
xlrec->leftsib, xlrec->rightsib, xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
xlrec->firstright, xlrec->level); xlrec->level, xlrec->firstright);
break; break;
} }
case XLOG_BTREE_SPLIT_L_ROOT: case XLOG_BTREE_SPLIT_L_ROOT:
...@@ -797,9 +815,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -797,9 +815,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfo(buf, "split_l_root: rel %u/%u/%u ", appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode); xlrec->node.relNode);
appendStringInfo(buf, "left %u, right %u off %u level %u", appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
xlrec->leftsib, xlrec->rightsib, xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
xlrec->firstright, xlrec->level); xlrec->level, xlrec->firstright);
break; break;
} }
case XLOG_BTREE_SPLIT_R_ROOT: case XLOG_BTREE_SPLIT_R_ROOT:
...@@ -809,9 +827,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) ...@@ -809,9 +827,9 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfo(buf, "split_r_root: rel %u/%u/%u ", appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode); xlrec->node.relNode);
appendStringInfo(buf, "left %u, right %u off %u level %u", appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
xlrec->leftsib, xlrec->rightsib, xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
xlrec->firstright, xlrec->level); xlrec->level, xlrec->firstright);
break; break;
} }
case XLOG_BTREE_DELETE: case XLOG_BTREE_DELETE:
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.112 2007/04/09 22:04:08 tgl Exp $ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.113 2007/04/11 20:47:38 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -260,17 +260,17 @@ typedef struct xl_btree_insert ...@@ -260,17 +260,17 @@ typedef struct xl_btree_insert
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData) #define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
/* /*
* On insert with split we save items of both left and right siblings * On insert with split, we save all the items going into the right sibling
* and restore content of both pages from log record. This way takes less * so that we can restore it completely from the log record. This way takes
* xlog space than the normal approach, because if we did it standardly, * less xlog space than the normal approach, because if we did it standardly,
* XLogInsert would almost always think the right page is new and store its * XLogInsert would almost always think the right page is new and store its
* whole page image. * whole page image. The left page, however, is handled in the normal
* incremental-update fashion.
* *
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted tuple went into the * The _L and _R variants indicate whether the inserted tuple went into the
* left or right split page (and thus, whether newitemoff and the new item * left or right split page (and thus, whether newitemoff and the new item
* are stored or not. * are stored or not). The _ROOT variants indicate that we are splitting
* page of the split pair). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or * the root page, and thus that a newroot record rather than an insert or
* split record should follow. Note that a split record never carries a * split record should follow. Note that a split record never carries a
* metapage update --- we'll do that in the parent-level update. * metapage update --- we'll do that in the parent-level update.
...@@ -280,18 +280,23 @@ typedef struct xl_btree_split ...@@ -280,18 +280,23 @@ typedef struct xl_btree_split
RelFileNode node; RelFileNode node;
BlockNumber leftsib; /* orig page / new left page */ BlockNumber leftsib; /* orig page / new left page */
BlockNumber rightsib; /* new right page */ BlockNumber rightsib; /* new right page */
OffsetNumber firstright; /* first item stored on right page */ BlockNumber rnext; /* next block (orig page's rightlink) */
BlockNumber rnext; /* next/right block pointer */
uint32 level; /* tree level of page being split */ uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
/* BlockIdData downlink follows if level > 0 */ /*
* If level > 0, BlockIdData downlink follows. (We use BlockIdData
/* OffsetNumber newitemoff follows in the _L variants. */ * rather than BlockNumber for alignment reasons: SizeOfBtreeSplit
/* New item follows in the _L variants */ * is only 16-bit aligned.)
/* RIGHT PAGES TUPLES FOLLOW AT THE END */ *
* In the _L variants, next are OffsetNumber newitemoff and the new item.
* (In the _R variants, the new item is one of the right page's tuples.)
*
* Last are the right page's tuples in the form used by _bt_restore_page.
*/
} xl_btree_split; } xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, level) + sizeof(uint32)) #define SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
/* /*
* This is what we need to know about delete of individual leaf index tuples. * This is what we need to know about delete of individual leaf index tuples.
......
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.402 2007/04/09 22:04:08 tgl Exp $ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.403 2007/04/11 20:47:38 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 200704091 #define CATALOG_VERSION_NO 200704111
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment