Commit c6e6d292 authored by Vadim B. Mikheev's avatar Vadim B. Mikheev

First step in an attempt to fix the tree at runtime: create upper levels

and a new root page if the old root page was split but a new root page
wasn't created.
The new code is protected by the FixBTree bool flag, which is set to FALSE,
so nothing should be affected by this untested approach.
parent 19c4197b
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.76 2001/01/24 19:42:48 momjian Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.77 2001/01/26 01:24:31 vadim Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -34,7 +34,9 @@ typedef struct ...@@ -34,7 +34,9 @@ typedef struct
int best_delta; /* best size delta so far */ int best_delta; /* best size delta so far */
} FindSplitData; } FindSplitData;
void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static TransactionId _bt_check_unique(Relation rel, BTItem btitem, static TransactionId _bt_check_unique(Relation rel, BTItem btitem,
Relation heapRel, Buffer buf, Relation heapRel, Buffer buf,
...@@ -44,6 +46,8 @@ static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, ...@@ -44,6 +46,8 @@ static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf,
int keysz, ScanKey scankey, int keysz, ScanKey scankey,
BTItem btitem, BTItem btitem,
OffsetNumber afteritem); OffsetNumber afteritem);
static void _bt_insertuple(Relation rel, Buffer buf,
Size itemsz, BTItem btitem, OffsetNumber newitemoff);
static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber newitemoff, Size newitemsz, OffsetNumber newitemoff, Size newitemsz,
BTItem newitem, bool newitemonleft, BTItem newitem, bool newitemonleft,
...@@ -456,9 +460,14 @@ _bt_insertonpg(Relation rel, ...@@ -456,9 +460,14 @@ _bt_insertonpg(Relation rel,
if (is_root) if (is_root)
{ {
Buffer rootbuf;
Assert(stack == (BTStack) NULL); Assert(stack == (BTStack) NULL);
/* create a new root node and release the split buffers */ /* create a new root node and release the split buffers */
_bt_newroot(rel, buf, rbuf); rootbuf = _bt_newroot(rel, buf, rbuf);
_bt_wrtbuf(rel, rootbuf);
_bt_wrtbuf(rel, rbuf);
_bt_wrtbuf(rel, buf);
} }
else else
{ {
...@@ -519,10 +528,31 @@ _bt_insertonpg(Relation rel, ...@@ -519,10 +528,31 @@ _bt_insertonpg(Relation rel,
} }
else else
{ {
START_CRIT_SECTION();
_bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
itup_off = newitemoff; itup_off = newitemoff;
itup_blkno = BufferGetBlockNumber(buf); itup_blkno = BufferGetBlockNumber(buf);
_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
/* Write out the updated page and release pin/lock */
_bt_wrtbuf(rel, buf);
}
/* by here, the new tuple is inserted at itup_blkno/itup_off */
res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
return res;
}
static void
_bt_insertuple(Relation rel, Buffer buf,
Size itemsz, BTItem btitem, OffsetNumber newitemoff)
{
Page page = BufferGetPage(buf);
BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
START_CRIT_SECTION();
_bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
/* XLOG stuff */ /* XLOG stuff */
{ {
xl_btree_insert xlrec; xl_btree_insert xlrec;
...@@ -530,7 +560,6 @@ _bt_insertonpg(Relation rel, ...@@ -530,7 +560,6 @@ _bt_insertonpg(Relation rel,
XLogRecPtr recptr; XLogRecPtr recptr;
XLogRecData rdata[2]; XLogRecData rdata[2];
BTItemData truncitem; BTItemData truncitem;
xlrec.target.node = rel->rd_node; xlrec.target.node = rel->rd_node;
ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff); ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff);
rdata[0].buffer = InvalidBuffer; rdata[0].buffer = InvalidBuffer;
...@@ -539,7 +568,7 @@ _bt_insertonpg(Relation rel, ...@@ -539,7 +568,7 @@ _bt_insertonpg(Relation rel,
rdata[0].next = &(rdata[1]); rdata[0].next = &(rdata[1]);
/* Read comments in _bt_pgaddtup */ /* Read comments in _bt_pgaddtup */
if (!(P_ISLEAF(lpageop)) && newitemoff == P_FIRSTDATAKEY(lpageop)) if (!(P_ISLEAF(pageop)) && newitemoff == P_FIRSTDATAKEY(pageop))
{ {
truncitem = *btitem; truncitem = *btitem;
truncitem.bti_itup.t_info = sizeof(BTItemData); truncitem.bti_itup.t_info = sizeof(BTItemData);
...@@ -554,8 +583,7 @@ _bt_insertonpg(Relation rel, ...@@ -554,8 +583,7 @@ _bt_insertonpg(Relation rel,
} }
rdata[1].buffer = buf; rdata[1].buffer = buf;
rdata[1].next = NULL; rdata[1].next = NULL;
if (P_ISLEAF(pageop))
if (P_ISLEAF(lpageop))
flag |= XLOG_BTREE_LEAF; flag |= XLOG_BTREE_LEAF;
recptr = XLogInsert(RM_BTREE_ID, flag, rdata); recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
...@@ -565,15 +593,6 @@ _bt_insertonpg(Relation rel, ...@@ -565,15 +593,6 @@ _bt_insertonpg(Relation rel,
} }
END_CRIT_SECTION(); END_CRIT_SECTION();
/* Write out the updated page and release pin/lock */
_bt_wrtbuf(rel, buf);
}
/* by here, the new tuple is inserted at itup_blkno/itup_off */
res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
return res;
} }
/* /*
...@@ -1131,10 +1150,11 @@ _bt_getstackbuf(Relation rel, BTStack stack) ...@@ -1131,10 +1150,11 @@ _bt_getstackbuf(Relation rel, BTStack stack)
* *
* On entry, lbuf (the old root) and rbuf (its new peer) are write- * On entry, lbuf (the old root) and rbuf (its new peer) are write-
* locked. On exit, a new root page exists with entries for the * locked. On exit, a new root page exists with entries for the
* two new children. The new root page is neither pinned nor locked, and * two new children, metapage is updated and unlocked/unpinned.
* we have also written out lbuf and rbuf and dropped their pins/locks. * The new root buffer is returned to caller which has to unlock/unpin
* lbuf, rbuf & rootbuf.
*/ */
void static Buffer
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{ {
Buffer rootbuf; Buffer rootbuf;
...@@ -1257,13 +1277,156 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) ...@@ -1257,13 +1277,156 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
} }
END_CRIT_SECTION(); END_CRIT_SECTION();
/* write and let go of the new root buffer */ /* write and let go of metapage buffer */
_bt_wrtbuf(rel, rootbuf);
_bt_wrtbuf(rel, metabuf); _bt_wrtbuf(rel, metabuf);
/* update and release new sibling, and finally the old root */ return(rootbuf);
_bt_wrtbuf(rel, rbuf); }
_bt_wrtbuf(rel, lbuf);
/*
 *	_bt_fixroot() -- build the missing upper level(s) above a split old root.
 *
 * Used when the old root page was split but no new root page was created.
 * We build the required parent level(s) while keeping the write lock on
 * the old root page.
 *
 *	rel			index relation being repaired
 *	oldrootbuf	buffer of the leftmost page of the parentless level; the
 *				caller is expected to hold a write lock on it.  It must be
 *				leftmost and must have a right sibling, else we elog(ERROR).
 *	release		if TRUE, oldrootbuf is written out and released as soon as
 *				the upper level has been built; if FALSE it is left locked
 *				for the caller (who then has to write it, since its
 *				btpo_parent/LSN are changed here).
 *
 * Note: it is assumed that the old root page's btpo_parent points to the
 * meta page, i.e. not to a parent page.
 *
 * Returns the new root page buffer, still write-locked.  The returned page
 * has been modified; presumably the caller is responsible for eventually
 * writing it out -- NOTE(review): confirm against callers.
 */
Buffer
_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release)
{
	Buffer		rootbuf;
	BlockNumber rootblk;
	Page		rootpage;
	XLogRecPtr	rootLSN;
	Page		oldrootpage = BufferGetPage(oldrootbuf);
	BTPageOpaque oldrootopaque = (BTPageOpaque)
		PageGetSpecialPointer(oldrootpage);
	Buffer		buf,
				leftbuf,
				rightbuf;
	Page		page,
				leftpage,
				rightpage;
	BTPageOpaque leftopaque,
				rightopaque;
	OffsetNumber newitemoff;
	BTItem		btitem,
				ritem;
	Size		itemsz;

	/* The old root must start its level and must have been split. */
	if (!P_LEFTMOST(oldrootopaque) || P_RIGHTMOST(oldrootopaque))
		elog(ERROR, "bt_fixroot: not valid old root page");

	/* Read the right neighbor and create the new root page */
	leftbuf = _bt_getbuf(rel, oldrootopaque->btpo_next, BT_WRITE);
	leftpage = BufferGetPage(leftbuf);
	leftopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
	rootbuf = _bt_newroot(rel, oldrootbuf, leftbuf);
	rootpage = BufferGetPage(rootbuf);
	rootLSN = PageGetLSN(rootpage);
	rootblk = BufferGetBlockNumber(rootbuf);

	/*
	 * Update LSN & StartUpID of the old root buffer and its neighbor to
	 * ensure that they will be written to disk after the new root creation
	 * has been logged.  Unfortunately, for the moment (?) we do not log
	 * this operation and so possibly break our rule to log the entire page
	 * content on the first modification after a checkpoint.
	 */
	HOLD_INTERRUPTS();
	oldrootopaque->btpo_parent = rootblk;
	leftopaque->btpo_parent = rootblk;
	PageSetLSN(oldrootpage, rootLSN);
	PageSetSUI(oldrootpage, ThisStartUpID);
	PageSetLSN(leftpage, rootLSN);
	PageSetSUI(leftpage, ThisStartUpID);
	RESUME_INTERRUPTS();

	/* parent page into which child pointers are inserted */
	buf = rootbuf;
	page = BufferGetPage(buf);

	/*
	 * Now read the other pages (if any) on this level and add them to the
	 * new root.  If a concurrent process splits one of the pages on this
	 * level, it will notice either btpo_parent == metablock or
	 * btpo_parent == rootblk.  In the first case it will give up its locks
	 * and try to lock the leftmost page buffer (oldrootbuf) to fix the
	 * root - i.e. it will wait for us and let us continue.  In the second
	 * case it will try to lock rootbuf while keeping its locks on buffers
	 * we have already passed, also waiting for us.  If we have to unlock
	 * rootbuf (split it) and that process has to split a page of the new
	 * level we created (the level of rootbuf), then it will wait while we
	 * create the upper level.  Etc.
	 */
	while (!P_RIGHTMOST(leftopaque))
	{
		rightbuf = _bt_getbuf(rel, leftopaque->btpo_next, BT_WRITE);
		rightpage = BufferGetPage(rightbuf);
		rightopaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

		/* Update LSN & StartUpID (see comments above) */
		HOLD_INTERRUPTS();
		rightopaque->btpo_parent = rootblk;
		if (XLByteLT(PageGetLSN(rightpage), rootLSN))
			PageSetLSN(rightpage, rootLSN);
		PageSetSUI(rightpage, ThisStartUpID);
		RESUME_INTERRUPTS();

		/*
		 * The parent entry for the right page is built from the left
		 * page's high key, with its TID pointing at the right page.
		 */
		ritem = (BTItem) PageGetItem(leftpage, PageGetItemId(leftpage, P_HIKEY));
		btitem = _bt_formitem(&(ritem->bti_itup));
		ItemPointerSet(&(btitem->bti_itup.t_tid), leftopaque->btpo_next, P_HIKEY);
		itemsz = IndexTupleDSize(btitem->bti_itup)
			+ (sizeof(BTItemData) - sizeof(IndexTupleData));
		itemsz = MAXALIGN(itemsz);

		/* always append after the last item currently on the parent page */
		newitemoff = OffsetNumberNext(PageGetMaxOffsetNumber(page));

		if (PageGetFreeSpace(page) < itemsz)
		{
			Buffer		newbuf;
			OffsetNumber firstright;
			OffsetNumber itup_off;
			BlockNumber itup_blkno;
			bool		newitemonleft;

			firstright = _bt_findsplitloc(rel, page,
										  newitemoff, itemsz, &newitemonleft);
			newbuf = _bt_split(rel, buf, firstright,
							   newitemoff, itemsz, btitem, newitemonleft,
							   &itup_off, &itup_blkno);

			/*
			 * Keep the lock on the new "root" buffer!  Any other parent
			 * page we just split has been modified, so write it out
			 * rather than merely dropping the lock and pin.
			 */
			if (buf != rootbuf)
				_bt_wrtbuf(rel, buf);
			buf = newbuf;
			page = BufferGetPage(buf);
		}
		else
			_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);

		/*
		 * Give up the left buffer.  Its btpo_parent and LSN were changed
		 * above, so it must be written, not just released.
		 */
		_bt_wrtbuf(rel, leftbuf);

		/* the item has been copied onto the parent page; don't leak it */
		pfree(btitem);

		leftbuf = rightbuf;
		leftpage = rightpage;
		leftopaque = rightopaque;
	}

	/* give up the rightmost page buffer (also modified above) */
	_bt_wrtbuf(rel, leftbuf);

	/*
	 * Here we hold locks on the old root buffer, on the new root buffer
	 * created by _bt_newroot() (rootbuf), and on the buffer used for the
	 * last insert operations (buf).  If rootbuf != buf then we have to
	 * create at least one more level.  And if "release" is TRUE (i.e. we
	 * have already created some levels) then we give up oldrootbuf,
	 * writing it out because its parent link was updated.
	 */
	if (release)
		_bt_wrtbuf(rel, oldrootbuf);

	if (rootbuf != buf)
	{
		/* buf received new items; write it before building the next level */
		_bt_wrtbuf(rel, buf);
		return (_bt_fixroot(rel, rootbuf, true));
	}

	return (rootbuf);
}
/* /*
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.47 2001/01/24 19:42:48 momjian Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.48 2001/01/26 01:24:31 vadim Exp $
* *
* NOTES * NOTES
* Postgres btree pages look like ordinary relation pages. The opaque * Postgres btree pages look like ordinary relation pages. The opaque
...@@ -28,6 +28,8 @@ ...@@ -28,6 +28,8 @@
#include "miscadmin.h" #include "miscadmin.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
extern bool FixBTree; /* comments in nbtree.c */
extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
/* /*
* We use high-concurrency locking on btrees. There are two cases in * We use high-concurrency locking on btrees. There are two cases in
...@@ -237,7 +239,58 @@ _bt_getroot(Relation rel, int access) ...@@ -237,7 +239,58 @@ _bt_getroot(Relation rel, int access)
if (! P_ISROOT(rootopaque)) if (! P_ISROOT(rootopaque))
{ {
/* it happened, try again */ /*
* It happened, but if root page splitter failed to create
* new root page then we'll go in loop trying to call
* _bt_getroot again and again.
*/
if (FixBTree)
{
Buffer newrootbuf;
check_parent:;
if (rootopaque->btpo_parent == BTREE_METAPAGE) /* unupdated! */
{
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_WRITE);
/* handle concurrent fix of root page */
if (rootopaque->btpo_parent == BTREE_METAPAGE) /* unupdated! */
{
newrootbuf = _bt_fixroot(rel, rootbuf, true);
LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(newrootbuf, BT_READ);
rootbuf = newrootbuf;
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
/* New root might be splitted while changing lock */
if (P_ISROOT(rootopaque))
return(rootbuf);
/* rootbuf is read locked */
goto check_parent;
}
else /* someone else already fixed root */
{
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_READ);
}
}
/*
* Ok, here we have old root page with btpo_parent pointing
* to upper level - check parent page because of there is
* good chance that parent is root page.
*/
newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
_bt_relbuf(rel, rootbuf, BT_READ);
rootbuf = newrootbuf;
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
if (P_ISROOT(rootopaque))
return(rootbuf);
/* no luck -:( */
}
/* try again */
_bt_relbuf(rel, rootbuf, BT_READ); _bt_relbuf(rel, rootbuf, BT_READ);
return _bt_getroot(rel, access); return _bt_getroot(rel, access);
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.75 2001/01/24 19:42:48 momjian Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.76 2001/01/26 01:24:31 vadim Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -26,13 +26,18 @@ ...@@ -26,13 +26,18 @@
#include "executor/executor.h" #include "executor/executor.h"
#include "miscadmin.h" #include "miscadmin.h"
#include "storage/sinval.h" #include "storage/sinval.h"
#include "access/xlogutils.h"
bool BuildingBtree = false; /* see comment in btbuild() */ bool BuildingBtree = false; /* see comment in btbuild() */
bool FastBuild = true; /* use sort/build instead of insertion bool FastBuild = true; /* use sort/build instead */
* build */ /* of insertion build */
#include "access/xlogutils.h"
/*
* TEMPORARY FLAG FOR TESTING NEW FIX TREE
* CODE WITHOUT AFFECTING ANYONE ELSE
*/
bool FixBTree = false;
static void _bt_restscan(IndexScanDesc scan); static void _bt_restscan(IndexScanDesc scan);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment