Commit 70508ba7 authored by Tom Lane

Make btree index structure adjustments and WAL logging changes needed to
support btree compaction, as per proposal of a few days ago.  btree index
pages no longer store parent links, instead they have a level indicator
(counting up from zero for leaf pages).  The FixBTree recovery logic is
removed, and replaced by code that detects missing parent-level insertions
during WAL replay.  Also, generate appropriate WAL entries when updating
btree metapage and when building a btree index from scratch.  I believe
btree indexes are now completely WAL-legal for the first time.
initdb forced due to index and WAL changes.
parent 4df0f1d2
......@@ -4,7 +4,7 @@
# Makefile for access/nbtree
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.12 2003/02/21 00:06:21 tgl Exp $
#
#-------------------------------------------------------------------------
......@@ -13,7 +13,7 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
nbtstrat.o nbtutils.o nbtsort.o
nbtstrat.o nbtutils.o nbtsort.o nbtxlog.o
all: SUBSYS.o
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -895,6 +895,89 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
return true;
}
/*
 *	_bt_get_endpoint() -- Locate the first or last page of one tree level
 *
 *		rel is the index, level is the tree level wanted (the leaf level
 *		is 0), and rightmost selects the last page on that level rather
 *		than the first.
 *
 *		InvalidBuffer is returned when no root page could be obtained,
 *		i.e. the index is empty.  All other failures go through elog().
 *
 *		On success the buffer handed back is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
{
	Buffer		curbuf;
	Page		curpage;
	BTPageOpaque curopaque;

	/*
	 * A leaf-level request may start its descent at the fast root; any
	 * other level starts at the true root.  (Nothing would be gained by
	 * being cleverer about intermediate levels.)
	 */
	curbuf = (level == 0) ? _bt_getroot(rel, BT_READ) : _bt_gettrueroot(rel);

	/* No root at all means the index is empty */
	if (!BufferIsValid(curbuf))
		return InvalidBuffer;

	curpage = BufferGetPage(curbuf);
	curopaque = (BTPageOpaque) PageGetSpecialPointer(curpage);

	for (;;)
	{
		OffsetNumber downoff;
		BlockNumber target;
		BTItem		downlink;

		/*
		 * Move right for as long as the current page is deleted (a live
		 * page must exist further right), or -- when the rightmost page
		 * is wanted -- for as long as a right sibling exists, which can
		 * happen if the page was split after we fetched the pointer to it.
		 */
		for (;;)
		{
			if (!P_ISDELETED(curopaque) &&
				(!rightmost || P_RIGHTMOST(curopaque)))
				break;

			target = curopaque->btpo_next;
			if (target == P_NONE)
				elog(ERROR, "_bt_get_endpoint: ran off end of btree");

			_bt_relbuf(rel, curbuf);
			curbuf = _bt_getbuf(rel, target, BT_READ);
			curpage = BufferGetPage(curbuf);
			curopaque = (BTPageOpaque) PageGetSpecialPointer(curpage);
		}

		/* Reached the requested level? */
		if (curopaque->btpo.level == level)
			break;

		/* Already below it: the requested level does not exist */
		if (curopaque->btpo.level < level)
			elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);

		/* Pick the downlink to the leftmost or rightmost child */
		downoff = rightmost ? PageGetMaxOffsetNumber(curpage)
			: P_FIRSTDATAKEY(curopaque);
		downlink = (BTItem) PageGetItem(curpage,
										PageGetItemId(curpage, downoff));
		target = ItemPointerGetBlockNumber(&(downlink->bti_itup.t_tid));

		/* Let go of the current page, then read-lock the child */
		_bt_relbuf(rel, curbuf);
		curbuf = _bt_getbuf(rel, target, BT_READ);
		curpage = BufferGetPage(curbuf);
		curopaque = (BTPageOpaque) PageGetSpecialPointer(curpage);
	}

	return curbuf;
}
/*
* _bt_endpoint() -- Find the first or last key in the index.
*
......@@ -910,8 +993,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Page page;
BTPageOpaque opaque;
ItemPointer current;
OffsetNumber offnum,
maxoff;
OffsetNumber maxoff;
OffsetNumber start;
BlockNumber blkno;
BTItem btitem;
......@@ -929,7 +1011,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
*/
buf = _bt_getroot(rel, BT_READ);
buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
if (!BufferIsValid(buf))
{
......@@ -942,51 +1024,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque));
for (;;)
{
if (P_ISLEAF(opaque))
break;
if (ScanDirectionIsForward(dir))
offnum = P_FIRSTDATAKEY(opaque);
else
offnum = PageGetMaxOffsetNumber(page);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* Race condition: If the child page we just stepped onto was just
* split, we need to make sure we're all the way at the right edge
* of the tree. See the paper by Lehman and Yao.
*/
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
{
do
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
} while (!P_RIGHTMOST(opaque));
}
}
/* okay, we've got the {left,right}-most page in the tree */
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
Assert(P_LEFTMOST(opaque));
/* There could be dead pages to the left, so not this: */
/* Assert(P_LEFTMOST(opaque)); */
start = P_FIRSTDATAKEY(opaque);
}
......
......@@ -35,7 +35,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.70 2002/11/15 01:26:08 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -43,6 +43,7 @@
#include "postgres.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "utils/tuplesort.h"
......@@ -76,7 +77,7 @@ typedef struct BTPageState
BTItem btps_minkey; /* copy of minimum key (first item) on
* page */
OffsetNumber btps_lastoff; /* last item offset loaded */
int btps_level; /* tree level (0 = leaf) */
uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free
* space */
struct BTPageState *btps_next; /* link to parent level, if any */
......@@ -90,8 +91,9 @@ typedef struct BTPageState
0)
static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags);
static BTPageState *_bt_pagestate(Relation index, int flags, int level);
static void _bt_blnewpage(Relation index, Buffer *buf, Page *page,
uint32 level);
static BTPageState *_bt_pagestate(Relation index, uint32 level);
static void _bt_slideleft(Relation index, Buffer buf, Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
BTItem btitem, OffsetNumber itup_off);
......@@ -179,7 +181,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
* allocate a new, clean btree page, not linked to any siblings.
*/
static void
_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
_bt_blnewpage(Relation index, Buffer *buf, Page *page, uint32 level)
{
BTPageOpaque opaque;
......@@ -192,23 +194,67 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
/* Initialize BT opaque state */
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
opaque->btpo_prev = opaque->btpo_next = P_NONE;
opaque->btpo_flags = flags;
opaque->btpo.level = level;
opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) *page)->pd_lower += sizeof(ItemIdData);
}
/*
 * Write out a finished btree page and release the lock and pin on it.
 *
 * This does what _bt_wrtbuf does, but first emits an XLOG_BTREE_NEWPAGE
 * WAL record carrying the page, unless the index has rd_istemp set.
 */
static void
_bt_blwritepage(Relation index, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/* No elog(ERROR) is allowed from here until the newpage op is logged */
	START_CRIT_SECTION();

	/* WAL-log the page, except when index->rd_istemp is set */
	if (!index->rd_istemp)
	{
		xl_btree_newpage newpage;
		XLogRecData chain[2];
		XLogRecPtr	newpage_lsn;

		/* Record header: which relation and which block */
		newpage.node = index->rd_node;
		newpage.blkno = BufferGetBlockNumber(buf);

		/* First chain element carries the header, tied to no buffer */
		chain[0].buffer = InvalidBuffer;
		chain[0].data = (char *) &newpage;
		chain[0].len = SizeOfBtreeNewpage;
		chain[0].next = &chain[1];

		/* Second chain element carries all BLCKSZ bytes of the page */
		chain[1].buffer = buf;
		chain[1].data = (char *) page;
		chain[1].len = BLCKSZ;
		chain[1].next = NULL;

		newpage_lsn = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWPAGE, chain);

		/* Stamp the page with the position of the record just inserted */
		PageSetLSN(page, newpage_lsn);
		PageSetSUI(page, ThisStartUpID);
	}

	END_CRIT_SECTION();

	_bt_wrtbuf(index, buf);
}
/*
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
static BTPageState *
_bt_pagestate(Relation index, int flags, int level)
_bt_pagestate(Relation index, uint32 level)
{
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page */
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), level);
state->btps_minkey = (BTItem) NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
......@@ -365,9 +411,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
ItemId hii;
BTItem obti;
/* Create new page */
_bt_blnewpage(index, &nbuf, &npage,
(state->btps_level > 0) ? 0 : BTP_LEAF);
/* Create new page on same level */
_bt_blnewpage(index, &nbuf, &npage, state->btps_level);
/*
* We copy the last item on the page into the new page, and then
......@@ -396,10 +441,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
* btree level.
*/
if (state->btps_next == (BTPageState *) NULL)
{
state->btps_next =
_bt_pagestate(index, 0, state->btps_level + 1);
}
state->btps_next = _bt_pagestate(index, state->btps_level + 1);
Assert(state->btps_minkey != NULL);
ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
BufferGetBlockNumber(obuf), P_HIKEY);
......@@ -414,16 +457,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
/*
* Set the sibling links for both pages, and parent links too.
*
* It's not necessary to set the parent link at all, because it's
* only used for handling concurrent root splits, but we may as
* well do it as a debugging aid. Note we set new page's link as
* well as old's, because if the new page turns out to be the last
* of the level, _bt_uppershutdown won't change it. The links may
* be out of date by the time the build finishes, but that's OK;
* they need only point to a left-sibling of the true parent. See
* the README file for more info.
* Set the sibling links for both pages.
*/
{
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
......@@ -431,9 +465,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
oopaque->btpo_next = BufferGetBlockNumber(nbuf);
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE;
oopaque->btpo_parent = nopaque->btpo_parent =
BufferGetBlockNumber(state->btps_next->btps_buf);
nopaque->btpo_next = P_NONE; /* redundant */
}
/*
......@@ -441,7 +473,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
* can give up our lock (if we had one; most likely BuildingBtree
* is set, so we aren't locking).
*/
_bt_wrtbuf(index, obuf);
_bt_blwritepage(index, obuf);
/*
* Reset last_off to point to new page
......@@ -519,7 +551,7 @@ _bt_uppershutdown(Relation index, BTPageState *state)
* slid back one slot. Then we can dump out the page.
*/
_bt_slideleft(index, s->btps_buf, s->btps_page);
_bt_wrtbuf(index, s->btps_buf);
_bt_blwritepage(index, s->btps_buf);
}
}
......@@ -603,7 +635,7 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
/* When we see first tuple, create first index page */
if (state == NULL)
state = _bt_pagestate(index, BTP_LEAF, 0);
state = _bt_pagestate(index, 0);
if (load1)
{
......@@ -623,13 +655,13 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
_bt_freeskey(indexScanKey);
}
else
/* merge is unnecessary */
{
/* merge is unnecessary */
while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
state = _bt_pagestate(index, BTP_LEAF, 0);
state = _bt_pagestate(index, 0);
_bt_buildadd(index, state, bti);
if (should_free)
......
This diff is collapsed.
......@@ -3,7 +3,7 @@
*
* Resource managers definition
*
* $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.9 2001/08/25 18:52:41 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.10 2003/02/21 00:06:22 tgl Exp $
*/
#include "postgres.h"
......@@ -19,21 +19,22 @@
#include "commands/sequence.h"
RmgrData RmgrTable[] = {
{"XLOG", xlog_redo, xlog_undo, xlog_desc},
{"Transaction", xact_redo, xact_undo, xact_desc},
{"Storage", smgr_redo, smgr_undo, smgr_desc},
{"CLOG", clog_redo, clog_undo, clog_desc},
{"Reserved 4", NULL, NULL, NULL},
{"Reserved 5", NULL, NULL, NULL},
{"Reserved 6", NULL, NULL, NULL},
{"Reserved 7", NULL, NULL, NULL},
{"Reserved 8", NULL, NULL, NULL},
{"Reserved 9", NULL, NULL, NULL},
{"Heap", heap_redo, heap_undo, heap_desc},
{"Btree", btree_redo, btree_undo, btree_desc},
{"Hash", hash_redo, hash_undo, hash_desc},
{"Rtree", rtree_redo, rtree_undo, rtree_desc},
{"Gist", gist_redo, gist_undo, gist_desc},
{"Sequence", seq_redo, seq_undo, seq_desc}
/*
 * Table of resource managers, indexed by resource manager ID; it holds
 * RM_MAX_ID+1 entries.
 *
 * Each entry lists, in order: name, redo routine, undo routine, desc
 * routine, startup hook, cleanup hook.  A NULL hook is simply skipped by
 * the code that walks this table.  Only the Btree entry supplies startup
 * and cleanup hooks.
 */
RmgrData RmgrTable[RM_MAX_ID+1] = {
{"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
{"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
{"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
{"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
/* IDs 4 through 9 are placeholders with no routines */
{"Reserved 4", NULL, NULL, NULL, NULL, NULL},
{"Reserved 5", NULL, NULL, NULL, NULL, NULL},
{"Reserved 6", NULL, NULL, NULL, NULL, NULL},
{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
{"Reserved 9", NULL, NULL, NULL, NULL, NULL},
{"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL},
/* Btree is the one resource manager with recovery startup/cleanup hooks */
{"Btree", btree_redo, btree_undo, btree_desc,
btree_xlog_startup, btree_xlog_cleanup},
{"Hash", hash_redo, hash_undo, hash_desc, NULL, NULL},
{"Rtree", rtree_redo, rtree_undo, rtree_desc, NULL, NULL},
{"Gist", gist_redo, gist_undo, gist_desc, NULL, NULL},
{"Sequence", seq_redo, seq_undo, seq_desc, NULL, NULL}
};
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.111 2003/01/25 03:06:04 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.112 2003/02/21 00:06:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1203,16 +1203,6 @@ XLogFlush(XLogRecPtr record)
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
if (XLOG_DEBUG)
{
elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X",
(IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
(InRedo) ? "(redo)" : "",
record.xlogid, record.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}
/* Disabled during REDO */
if (InRedo)
return;
......@@ -1221,6 +1211,15 @@ XLogFlush(XLogRecPtr record)
if (XLByteLE(record, LogwrtResult.Flush))
return;
if (XLOG_DEBUG)
{
elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X",
(IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
record.xlogid, record.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
}
START_CRIT_SECTION();
/*
......@@ -2515,6 +2514,12 @@ StartupXLOG(void)
elog(LOG, "database system was interrupted at %s",
str_time(ControlFile->time));
/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED)
sleep(60);
#endif
/*
* Get the last valid checkpoint record. If the latest one according
* to pg_control is broken, try the next-to-last one.
......@@ -2578,14 +2583,23 @@ StartupXLOG(void)
/* REDO */
if (InRecovery)
{
int rmid;
elog(LOG, "database system was not properly shut down; "
"automatic recovery in progress");
ControlFile->state = DB_IN_RECOVERY;
ControlFile->time = time(NULL);
UpdateControlFile();
/* Start up the recovery environment */
XLogInitRelationCache();
for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
{
if (RmgrTable[rmid].rm_startup != NULL)
RmgrTable[rmid].rm_startup();
}
/* Is REDO required ? */
if (XLByteLT(checkPoint.redo, RecPtr))
record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
......@@ -2737,7 +2751,25 @@ StartupXLOG(void)
if (InRecovery)
{
int rmid;
/*
* Allow resource managers to do any required cleanup.
*/
for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
{
if (RmgrTable[rmid].rm_cleanup != NULL)
RmgrTable[rmid].rm_cleanup();
}
/* suppress in-transaction check in CreateCheckPoint */
MyLastRecPtr.xrecoff = 0;
MyXactMadeXLogEntry = false;
MyXactMadeTempRelUpdate = false;
/*
* Perform a new checkpoint to update our recovery activity to disk.
*
* In case we had to use the secondary checkpoint, make sure that
* it will still be shown as the secondary checkpoint after this
* CreateCheckPoint operation; we don't want the broken primary
......@@ -2745,6 +2777,10 @@ StartupXLOG(void)
*/
ControlFile->checkPoint = checkPointLoc;
CreateCheckPoint(true, true);
/*
* Close down recovery environment
*/
XLogCloseRelationCache();
}
......
This diff is collapsed.
......@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: xlog.h,v 1.40 2002/11/15 02:44:57 momjian Exp $
* $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
......@@ -145,10 +145,12 @@ typedef XLogPageHeaderData *XLogPageHeader;
*/
/* One entry of RmgrTable: the name and callbacks of a resource manager. */
typedef struct RmgrData
{
char *rm_name;
const char *rm_name;	/* resource manager name, e.g. "Btree" */
/* NOTE(review): presumably replays one WAL record at lsn -- confirm */
void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
/* NOTE(review): presumably undoes one WAL record at lsn -- confirm */
void (*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr);
/* NOTE(review): presumably writes a description of rec into buf -- confirm */
void (*rm_desc) (char *buf, uint8 xl_info, char *rec);
/*
 * Optional hook, may be NULL.  StartupXLOG calls it during recovery,
 * right after XLogInitRelationCache().
 */
void (*rm_startup) (void);
/*
 * Optional hook, may be NULL.  StartupXLOG calls it during recovery,
 * before the CreateCheckPoint() call.
 */
void (*rm_cleanup) (void);
} RmgrData;
extern RmgrData RmgrTable[];
......
......@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: catversion.h,v 1.177 2003/02/16 02:30:39 tgl Exp $
* $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 200302151
#define CATALOG_VERSION_NO 200302171
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment