Reduce WAL activity for page splits:

> Currently, an index split writes all the data on the split page to
> WAL. That's a lot of WAL traffic. The tuples that are copied to the
> right page need to be WAL logged, but the tuples that stay on the
> original page don't.

Heikki Linnakangas

Reduce WAL activity for page splits:
> Currently, an index split writes all the data on the split page to
> WAL. That's a lot of WAL traffic. The tuples that are copied to the
> right page need to be WAL logged, but the tuples that stay on the
> original page don't.

Heikki Linnakangas
b79575ce · Bruce Momjian · fe03a5f4 · b79575ce · b79575ce · b79575ce
Commit b79575ce authored Feb 08, 2007 by Bruce Momjian
3 changed files
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 				rightoff;
 	OffsetNumber maxoff;
 	OffsetNumber i;
+	bool		isroot;
 	rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 	origpage = BufferGetPage(buf);
@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
 	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+	isroot = P_ISROOT(oopaque);
 	/* if we're splitting this page, it won't be the root when we're done */
 	/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
 	lopaque->btpo_flags = oopaque->btpo_flags;
@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		MarkBufferDirty(sbuf);
 	}
+	/*
+	 * By here, the original data page has been split into two new halves, and
+	 * these are correct.  The algorithm requires that the left page never
+	 * move during a split, so we copy the new left page back on top of the
+	 * original.  Note that this is not a waste of time, since we also require
+	 * (in the page management code) that the center of a page always be
+	 * clean, and the most efficient way to guarantee this is just to compact
+	 * the data by reinserting it into a new left page.  (XXX the latter
+	 * comment is probably obsolete.)
+	 *
+	 * We need to do this before writing the WAL record, so that XLogInsert can
+	 * WAL log an image of the page if necessary.
+	 */
+	PageRestoreTempPage(leftpage, origpage);
 	/* XLOG stuff */
 	if (!rel->rd_istemp)
 	{
 		xl_btree_split xlrec;
 		uint8		xlinfo;
 		XLogRecPtr	recptr;
-		XLogRecData rdata[4];
+		XLogRecData rdata[6];
+		XLogRecData *lastrdata;
-		xlrec.target.node = rel->rd_node;
+		xlrec.node = rel->rd_node;
-		ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
+		xlrec.leftsib = BufferGetBlockNumber(buf);
+		xlrec.rightsib = BufferGetBlockNumber(rbuf);
+		xlrec.firstright = firstright;
+		xlrec.rnext = ropaque->btpo_next;
+		xlrec.level = lopaque->btpo.level;
+		rdata[0].data = (char *) &xlrec;
+		rdata[0].len = SizeOfBtreeSplit;
+		rdata[0].buffer = InvalidBuffer;
+		lastrdata = &rdata[0];
+		/* Log downlink on non-leaf pages. */
+		if (lopaque->btpo.level > 0)
+		{
+			lastrdata->next = lastrdata + 1;
+			lastrdata++;
+			lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
+			lastrdata->len = sizeof(BlockIdData);
+			lastrdata->buffer = InvalidBuffer;
+		}
+		/* Log the new item, if it was inserted on the left page. If it was 
+		 * put on the right page, we don't need to explicitly WAL log it 
+		 * because it's included with all the other items on the right page.
+		 */
+		lastrdata->next = lastrdata + 1;
+		lastrdata++;
 		if (newitemonleft)
-			xlrec.otherblk = BufferGetBlockNumber(rbuf);
+		{
+			lastrdata->data = (char *) &newitemoff;
+			lastrdata->len = sizeof(OffsetNumber);
+			lastrdata->buffer = buf;		/* backup block 1 */
+			lastrdata->buffer_std = true;
+			lastrdata->next = lastrdata + 1;
+			lastrdata++;
+			lastrdata->data = (char *)newitem;
+			lastrdata->len = newitemsz;
+			lastrdata->buffer = buf;		/* backup block 1 */
+			lastrdata->buffer_std = true;
+		}
 		else
-			xlrec.otherblk = BufferGetBlockNumber(buf);
+		{
-		xlrec.leftblk = lopaque->btpo_prev;
+			lastrdata->data = NULL;
-		xlrec.rightblk = ropaque->btpo_next;
+			lastrdata->len = 0;
-		xlrec.level = lopaque->btpo.level;
+			lastrdata->buffer = buf;		/* backup block 1 */
+			lastrdata->buffer_std = true;
+		}
-		/*
+		/* Log the contents of the right page in the format understood by
+		 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
+		 * because we're going to recreate the whole page anyway.
+		 *
 		 * Direct access to page is not good but faster - we should implement
 		 * some new func in page API.  Note we only store the tuples
 		 * themselves, knowing that the item pointers are in the same order
 		 * and can be reconstructed by scanning the tuples.  See comments for
 		 * _bt_restore_page().
 		 */
-		xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
+		lastrdata->next = lastrdata + 1;
-			((PageHeader) leftpage)->pd_upper;
+		lastrdata++;
-		rdata[0].data = (char *) &xlrec;
+		lastrdata->data = (char *) rightpage + 
-		rdata[0].len = SizeOfBtreeSplit;
-		rdata[0].buffer = InvalidBuffer;
-		rdata[0].next = &(rdata[1]);
-		rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
-		rdata[1].len = xlrec.leftlen;
-		rdata[1].buffer = InvalidBuffer;
-		rdata[1].next = &(rdata[2]);
-		rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
-		rdata[2].len = ((PageHeader) rightpage)->pd_special -
 			((PageHeader) rightpage)->pd_upper;
-		rdata[2].buffer = InvalidBuffer;
+		lastrdata->len = ((PageHeader) rightpage)->pd_special -
-		rdata[2].next = NULL;
+			((PageHeader) rightpage)->pd_upper;
+		lastrdata->buffer = InvalidBuffer;
+		/* Log the right sibling, because we've changed its prev-pointer. */
 		if (!P_RIGHTMOST(ropaque))
 		{
-			rdata[2].next = &(rdata[3]);
+			lastrdata->next = lastrdata + 1;
-			rdata[3].data = NULL;
+			lastrdata++;
-			rdata[3].len = 0;
-			rdata[3].buffer = sbuf;
+			lastrdata->data = NULL;
-			rdata[3].buffer_std = true;
+			lastrdata->len = 0;
-			rdata[3].next = NULL;
+			lastrdata->buffer = sbuf;		/* backup block 2 */
+			lastrdata->buffer_std = true;
 		}
-		if (P_ISROOT(oopaque))
+		lastrdata->next = NULL;
+		if (isroot)
 			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
 		else
 			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		}
 	}
-	/*
-	 * By here, the original data page has been split into two new halves, and
-	 * these are correct.  The algorithm requires that the left page never
-	 * move during a split, so we copy the new left page back on top of the
-	 * original.  Note that this is not a waste of time, since we also require
-	 * (in the page management code) that the center of a page always be
-	 * clean, and the most efficient way to guarantee this is just to compact
-	 * the data by reinserting it into a new left page.  (XXX the latter
-	 * comment is probably obsolete.)
-	 *
-	 * It's a bit weird that we don't fill in the left page till after writing
-	 * the XLOG entry, but not really worth changing.  Note that we use the
-	 * origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
-	 * entry, so simply reshuffling the code won't do.
-	 */
-	PageRestoreTempPage(leftpage, origpage);
 	END_CRIT_SECTION();
 	/* release the old right sibling */

--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.110 2007/02/05 04:22:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.111 2007/02/08 05:05:53 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -259,7 +259,8 @@ typedef struct xl_btree_insert
 *
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
- * left or right split page (and thus, whether otherblk is the right or left
+ * left or right split page (and thus, whether newitemoff and the new item
+ * are stored or not: they are stored only when the tuple went into the left
 * page of the split pair).  The _ROOT variants indicate that we are splitting
 * the root page, and thus that a newroot record rather than an insert or
 * split record should follow.	Note that a split record never carries a
@@ -267,17 +268,21 @@ typedef struct xl_btree_insert
 */
 typedef struct xl_btree_split
 {
-	xl_btreetid target;			/* inserted tuple id */
+	RelFileNode node;
-	BlockNumber otherblk;		/* second block participated in split: */
+	BlockNumber leftsib;	 /* orig page / new left page */
-	/* first one is stored in target' tid */
+	BlockNumber rightsib;	 /* new right page */
-	BlockNumber leftblk;		/* prev/left block */
+	OffsetNumber firstright; /* first item stored on right page */
-	BlockNumber rightblk;		/* next/right block */
+	BlockNumber rnext;		 /* next/right block pointer */
-	uint32		level;			/* tree level of page being split */
+	uint32		level;		 /* tree level of page being split */
-	uint16		leftlen;		/* len of left page items below */
-	/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
+	/* BlockIdData downlink follows if level > 0 */
+	/* OffsetNumber newitemoff follows in the _L variants. */
+	/* New item follows in the _L variants. */
+	/* RIGHT PAGE'S TUPLES FOLLOW AT THE END */
 } xl_btree_split;
-#define SizeOfBtreeSplit	(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, level) + sizeof(uint32))
 /*
 * This is what we need to know about delete of individual leaf index tuples.