Commit b79575ce authored by Bruce Momjian

Reduce WAL activity for page splits:

> Currently, an index split writes all the data on the split page to
> WAL. That's a lot of WAL traffic. The tuples that are copied to the
> right page need to be WAL logged, but the tuples that stay on the
> original page don't.

Heikki Linnakangas
parent fe03a5f4
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
rightoff; rightoff;
OffsetNumber maxoff; OffsetNumber maxoff;
OffsetNumber i; OffsetNumber i;
bool isroot;
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
origpage = BufferGetPage(buf); origpage = BufferGetPage(buf);
...@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
isroot = P_ISROOT(oopaque);
/* if we're splitting this page, it won't be the root when we're done */ /* if we're splitting this page, it won't be the root when we're done */
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
lopaque->btpo_flags = oopaque->btpo_flags; lopaque->btpo_flags = oopaque->btpo_flags;
...@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
MarkBufferDirty(sbuf); MarkBufferDirty(sbuf);
} }
/*
* By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never
* move during a split, so we copy the new left page back on top of the
* original. Note that this is not a waste of time, since we also require
* (in the page management code) that the center of a page always be
* clean, and the most efficient way to guarantee this is just to compact
* the data by reinserting it into a new left page. (XXX the latter
* comment is probably obsolete.)
*
* We need to do this before writing the WAL record, so that XLogInsert can
* WAL log an image of the page if necessary.
*/
PageRestoreTempPage(leftpage, origpage);
/* XLOG stuff */ /* XLOG stuff */
if (!rel->rd_istemp) if (!rel->rd_istemp)
{ {
xl_btree_split xlrec; xl_btree_split xlrec;
uint8 xlinfo; uint8 xlinfo;
XLogRecPtr recptr; XLogRecPtr recptr;
XLogRecData rdata[4]; XLogRecData rdata[6];
XLogRecData *lastrdata;
xlrec.target.node = rel->rd_node; xlrec.node = rel->rd_node;
ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off); xlrec.leftsib = BufferGetBlockNumber(buf);
xlrec.rightsib = BufferGetBlockNumber(rbuf);
xlrec.firstright = firstright;
xlrec.rnext = ropaque->btpo_next;
xlrec.level = lopaque->btpo.level;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit;
rdata[0].buffer = InvalidBuffer;
lastrdata = &rdata[0];
/* Log downlink on non-leaf pages. */
if (lopaque->btpo.level > 0)
{
lastrdata->next = lastrdata + 1;
lastrdata++;
lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
lastrdata->len = sizeof(BlockIdData);
lastrdata->buffer = InvalidBuffer;
}
/* Log the new item, if it was inserted on the left page. If it was
* put on the right page, we don't need to explicitly WAL log it
* because it's included with all the other items on the right page.
*/
lastrdata->next = lastrdata + 1;
lastrdata++;
if (newitemonleft) if (newitemonleft)
xlrec.otherblk = BufferGetBlockNumber(rbuf); {
lastrdata->data = (char *) &newitemoff;
lastrdata->len = sizeof(OffsetNumber);
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
lastrdata->next = lastrdata + 1;
lastrdata++;
lastrdata->data = (char *)newitem;
lastrdata->len = newitemsz;
lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
else else
xlrec.otherblk = BufferGetBlockNumber(buf); {
xlrec.leftblk = lopaque->btpo_prev; lastrdata->data = NULL;
xlrec.rightblk = ropaque->btpo_next; lastrdata->len = 0;
xlrec.level = lopaque->btpo.level; lastrdata->buffer = buf; /* backup block 1 */
lastrdata->buffer_std = true;
}
/* /* Log the contents of the right page in the format understood by
* _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
* because we're going to recreate the whole page anyway.
*
* Direct access to page is not good but faster - we should implement * Direct access to page is not good but faster - we should implement
* some new func in page API. Note we only store the tuples * some new func in page API. Note we only store the tuples
* themselves, knowing that the item pointers are in the same order * themselves, knowing that the item pointers are in the same order
* and can be reconstructed by scanning the tuples. See comments for * and can be reconstructed by scanning the tuples. See comments for
* _bt_restore_page(). * _bt_restore_page().
*/ */
xlrec.leftlen = ((PageHeader) leftpage)->pd_special - lastrdata->next = lastrdata + 1;
((PageHeader) leftpage)->pd_upper; lastrdata++;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfBtreeSplit;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper; lastrdata->data = (char *) rightpage +
rdata[1].len = xlrec.leftlen;
rdata[1].buffer = InvalidBuffer;
rdata[1].next = &(rdata[2]);
rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
rdata[2].len = ((PageHeader) rightpage)->pd_special -
((PageHeader) rightpage)->pd_upper; ((PageHeader) rightpage)->pd_upper;
rdata[2].buffer = InvalidBuffer; lastrdata->len = ((PageHeader) rightpage)->pd_special -
rdata[2].next = NULL; ((PageHeader) rightpage)->pd_upper;
lastrdata->buffer = InvalidBuffer;
/* Log the right sibling, because we've changed its prev-pointer. */
if (!P_RIGHTMOST(ropaque)) if (!P_RIGHTMOST(ropaque))
{ {
rdata[2].next = &(rdata[3]); lastrdata->next = lastrdata + 1;
rdata[3].data = NULL; lastrdata++;
rdata[3].len = 0;
rdata[3].buffer = sbuf; lastrdata->data = NULL;
rdata[3].buffer_std = true; lastrdata->len = 0;
rdata[3].next = NULL; lastrdata->buffer = sbuf; /* backup block 2 */
lastrdata->buffer_std = true;
} }
if (P_ISROOT(oopaque)) lastrdata->next = NULL;
if (isroot)
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
else else
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
...@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ...@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
} }
} }
/*
* By here, the original data page has been split into two new halves, and
* these are correct. The algorithm requires that the left page never
* move during a split, so we copy the new left page back on top of the
* original. Note that this is not a waste of time, since we also require
* (in the page management code) that the center of a page always be
* clean, and the most efficient way to guarantee this is just to compact
* the data by reinserting it into a new left page. (XXX the latter
* comment is probably obsolete.)
*
* It's a bit weird that we don't fill in the left page till after writing
* the XLOG entry, but not really worth changing. Note that we use the
* origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
* entry, so simply reshuffling the code won't do.
*/
PageRestoreTempPage(leftpage, origpage);
END_CRIT_SECTION(); END_CRIT_SECTION();
/* release the old right sibling */ /* release the old right sibling */
......
This diff is collapsed.
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.110 2007/02/05 04:22:18 tgl Exp $ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.111 2007/02/08 05:05:53 momjian Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -259,7 +259,8 @@ typedef struct xl_btree_insert ...@@ -259,7 +259,8 @@ typedef struct xl_btree_insert
* *
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted tuple went into the * The _L and _R variants indicate whether the inserted tuple went into the
* left or right split page (and thus, whether otherblk is the right or left * left or right split page (and thus, whether newitemoff and the new item
* are stored or not.
* page of the split pair). The _ROOT variants indicate that we are splitting * page of the split pair). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or * the root page, and thus that a newroot record rather than an insert or
* split record should follow. Note that a split record never carries a * split record should follow. Note that a split record never carries a
...@@ -267,17 +268,21 @@ typedef struct xl_btree_insert ...@@ -267,17 +268,21 @@ typedef struct xl_btree_insert
*/ */
typedef struct xl_btree_split typedef struct xl_btree_split
{ {
xl_btreetid target; /* inserted tuple id */ RelFileNode node;
BlockNumber otherblk; /* second block participated in split: */ BlockNumber leftsib; /* orig page / new left page */
/* first one is stored in target' tid */ BlockNumber rightsib; /* new right page */
BlockNumber leftblk; /* prev/left block */ OffsetNumber firstright; /* first item stored on right page */
BlockNumber rightblk; /* next/right block */ BlockNumber rnext; /* next/right block pointer */
uint32 level; /* tree level of page being split */ uint32 level; /* tree level of page being split */
uint16 leftlen; /* len of left page items below */
/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */ /* BlockIdData downlink follows if level > 0 */
/* OffsetNumber newitemoff follows in the _L variants. */
/* New item follows in the _L variants */
/* RIGHT PAGES TUPLES FOLLOW AT THE END */
} xl_btree_split; } xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) #define SizeOfBtreeSplit (offsetof(xl_btree_split, level) + sizeof(uint32))
/* /*
* This is what we need to know about delete of individual leaf index tuples. * This is what we need to know about delete of individual leaf index tuples.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment