Commit 421f0baa authored by Tom Lane

Further cleanup of btbuild (CREATE INDEX). Avoid storing unneeded
left keys during bottom-up index build, and leave some free space
instead of packing the pages to the brim (so as to avoid vast numbers
of page splits during the first interactive insertions).
parent 1ea912e1
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.2 2000/07/21 06:42:32 tgl Exp $ $Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.3 2000/07/21 22:14:09 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao, high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
...@@ -168,8 +168,7 @@ Notes about data representation: ...@@ -168,8 +168,7 @@ Notes about data representation:
Notes to operator class implementors: Notes to operator class implementors:
+ With this implementation, we require the user to supply us with + With this implementation, we require each supported datatype to supply
a procedure for pg_amproc. This procedure should take two keys us with a comparison procedure via pg_amproc. This procedure must take
A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B, two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
respectively. See the contents of that relation for the btree A = B, or A > B, respectively. See nbtcompare.c for examples.
access method for some samples.
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
* its parent level. When we have only one page on a level, it must be * its parent level. When we have only one page on a level, it must be
* the root -- it can be attached to the btree metapage and we are done. * the root -- it can be attached to the btree metapage and we are done.
* *
* this code is moderately slow (~10% slower) compared to the regular * This code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on * btree (insertion) build code on sorted or well-clustered data. On
* random data, however, the insertion build code is unusable -- the * random data, however, the insertion build code is unusable -- the
* difference on a 60MB heap is a factor of 15 because the random * difference on a 60MB heap is a factor of 15 because the random
* probes into the btree thrash the buffer pool. (NOTE: the above * probes into the btree thrash the buffer pool. (NOTE: the above
...@@ -22,25 +22,20 @@ ...@@ -22,25 +22,20 @@
* not very good external sort implementation that used to exist in * not very good external sort implementation that used to exist in
* this module. tuplesort.c is almost certainly faster.) * this module. tuplesort.c is almost certainly faster.)
* *
* this code currently packs the pages to 100% of capacity. this is * It is not wise to pack the pages entirely full, since then *any*
* not wise, since *any* insertion will cause splitting. filling to * insertion would cause a split (and not only of the leaf page; the need
* something like the standard 70% steady-state load factor for btrees * for a split would cascade right up the tree). The steady-state load
* would probably be better. * factor for btrees is usually estimated at 70%. We choose to pack leaf
* * pages to 90% and upper pages to 70%. This gives us reasonable density
* Another limitation is that we currently load full copies of all keys * (there aren't many upper pages if the keys are reasonable-size) without
* into upper tree levels. The leftmost data key in each non-leaf node * incurring a lot of cascading splits during early insertions.
* could be omitted as far as normal btree operations are concerned
* (see README for more info). However, because we build the tree from
* the bottom up, we need that data key to insert into the node's parent.
* This could be fixed by keeping a spare copy of the minimum key in the
* state stack, but I haven't time for that right now.
* *
* *
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.55 2000/07/21 06:42:33 tgl Exp $ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.56 2000/07/21 22:14:09 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -51,14 +46,6 @@ ...@@ -51,14 +46,6 @@
#include "utils/tuplesort.h" #include "utils/tuplesort.h"
/*
* turn on debugging output.
*
* XXX this code just does a numeric printf of the index key, so it's
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
/* /*
* Status record for spooling. * Status record for spooling.
*/ */
...@@ -72,13 +59,24 @@ struct BTSpool ...@@ -72,13 +59,24 @@ struct BTSpool
/* /*
* Status record for a btree page being built. We have one of these * Status record for a btree page being built. We have one of these
* for each active tree level. * for each active tree level.
*
* The reason we need to store a copy of the minimum key is that we'll
* need to propagate it to the parent node when this page is linked
* into its parent. However, if the page is not a leaf page, the first
* entry on the page doesn't need to contain a key, so we will not have
* stored the key itself on the page. (You might think we could skip
* copying the minimum key on leaf pages, but actually we must have a
* writable copy anyway because we'll poke the page's address into it
* before passing it up to the parent...)
*/ */
typedef struct BTPageState typedef struct BTPageState
{ {
Buffer btps_buf; /* current buffer & page */ Buffer btps_buf; /* current buffer & page */
Page btps_page; Page btps_page;
BTItem btps_minkey; /* copy of minimum key (first item) on page */
OffsetNumber btps_lastoff; /* last item offset loaded */ OffsetNumber btps_lastoff; /* last item offset loaded */
int btps_level; int btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free space */
struct BTPageState *btps_next; /* link to parent level, if any */ struct BTPageState *btps_next; /* link to parent level, if any */
} BTPageState; } BTPageState;
...@@ -90,12 +88,14 @@ typedef struct BTPageState ...@@ -90,12 +88,14 @@ typedef struct BTPageState
0) 0)
static void _bt_load(Relation index, BTSpool *btspool); static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags);
static void _bt_buildadd(Relation index, BTPageState *state,
BTItem bti, int flags);
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
static BTPageState *_bt_pagestate(Relation index, int flags, int level); static BTPageState *_bt_pagestate(Relation index, int flags, int level);
static void _bt_slideleft(Relation index, Buffer buf, Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
BTItem btitem, OffsetNumber itup_off);
static void _bt_buildadd(Relation index, BTPageState *state, BTItem bti);
static void _bt_uppershutdown(Relation index, BTPageState *state); static void _bt_uppershutdown(Relation index, BTPageState *state);
static void _bt_load(Relation index, BTSpool *btspool);
/* /*
...@@ -190,6 +190,35 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) ...@@ -190,6 +190,35 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
opaque->btpo_flags = flags; opaque->btpo_flags = flags;
} }
/*
 * Build a fresh BTPageState for one level of the tree under construction.
 *
 * index - relation being built
 * flags - page flags handed to _bt_blnewpage for the level's first page
 * level - tree level this state describes (0 = leaf)
 *
 * The result is palloc'd, already owns its first page, and can be passed
 * straight to _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(Relation index, int flags, int level)
{
	BTPageState *newstate;

	newstate = (BTPageState *) palloc(sizeof(BTPageState));
	MemSet((char *) newstate, 0, sizeof(BTPageState));

	/* obtain the first page for this level */
	_bt_blnewpage(index, &(newstate->btps_buf), &(newstate->btps_page), flags);

	/* nothing has been loaded yet: no minimum key, no parent level */
	newstate->btps_minkey = (BTItem) NULL;
	newstate->btps_next = (BTPageState *) NULL;
	newstate->btps_level = level;

	/* starting from P_HIKEY makes the first item land at P_FIRSTKEY */
	newstate->btps_lastoff = P_HIKEY;

	/*
	 * Free-space threshold below which the page is treated as full:
	 * 3/10 of the page size on upper levels, 1/10 on the leaf level.
	 */
	newstate->btps_full = (level > 0)
		? (PageGetPageSize(newstate->btps_page) * 3) / 10
		: PageGetPageSize(newstate->btps_page) / 10;

	return newstate;
}
/* /*
* slide an array of ItemIds back one slot (from P_FIRSTKEY to * slide an array of ItemIds back one slot (from P_FIRSTKEY to
* P_HIKEY, overwriting P_HIKEY). we need to do this when we discover * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
...@@ -219,53 +248,49 @@ _bt_slideleft(Relation index, Buffer buf, Page page) ...@@ -219,53 +248,49 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
} }
/* /*
* allocate and initialize a new BTPageState. the returned structure * Add an item to a page being built.
* is suitable for immediate use by _bt_buildadd. *
*/ * The main difference between this routine and a bare PageAddItem call
static BTPageState * * is that this code knows that the leftmost data item on a non-leaf
_bt_pagestate(Relation index, int flags, int level) * btree page doesn't need to have a key. Therefore, it strips such
{ * items down to just the item header.
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); *
* This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
MemSet((char *) state, 0, sizeof(BTPageState)); * that because it assumes that P_RIGHTMOST() will return the correct
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); * answer for the page. Here, we don't know yet if the page will be
state->btps_lastoff = P_HIKEY; * rightmost. Offset P_FIRSTKEY is always the first data key.
state->btps_next = (BTPageState *) NULL;
state->btps_level = level;
return state;
}
/*
* return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on
* 'opage'. the copy is modified to point to 'opage' (as opposed to
* the page to which the item used to point, e.g., a heap page if
* 'opage' is a leaf page).
*/ */
static BTItem static void
_bt_minitem(Page opage, BlockNumber oblkno, int atend) _bt_sortaddtup(Page page,
Size itemsize,
BTItem btitem,
OffsetNumber itup_off)
{ {
OffsetNumber off; BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
BTItem obti; BTItemData truncitem;
BTItem nbti;
off = atend ? P_HIKEY : P_FIRSTKEY; if (! P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); {
nbti = _bt_formitem(&(obti->bti_itup)); memcpy(&truncitem, btitem, sizeof(BTItemData));
ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); truncitem.bti_itup.t_info = sizeof(BTItemData);
btitem = &truncitem;
itemsize = sizeof(BTItemData);
}
return nbti; if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort");
} }
/* /*----------
* add an item to a disk page from the sort output. * Add an item to a disk page from the sort output.
* *
* we must be careful to observe the following restrictions, placed * We must be careful to observe the page layout conventions of nbtsearch.c:
* upon us by the conventions in nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
* - rightmost pages start data items at P_HIKEY instead of at * - on non-leaf pages, the key portion of the first item need not be
* P_FIRSTKEY. * stored, we should store only the link.
* *
* a leaf page being built looks like: * A leaf page being built looks like:
* *
* +----------------+---------------------------------+ * +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... | * | PageHeaderData | linp0 linp1 linp2 ... |
...@@ -280,16 +305,18 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) ...@@ -280,16 +305,18 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* | ... item3 item2 item1 | "special space" | * | ... item3 item2 item1 | "special space" |
* +--------------------------------+-----------------+ * +--------------------------------+-----------------+
* *
* contrast this with the diagram in bufpage.h; note the mismatch * Contrast this with the diagram in bufpage.h; note the mismatch
* between linps and items. this is because we reserve linp0 as a * between linps and items. This is because we reserve linp0 as a
* placeholder for the pointer to the "high key" item; when we have * placeholder for the pointer to the "high key" item; when we have
* filled up the page, we will set linp0 to point to itemN and clear * filled up the page, we will set linp0 to point to itemN and clear
* linpN. * linpN. On the other hand, if we find this is the last (rightmost)
* page, we leave the items alone and slide the linp array over.
* *
* 'last' pointer indicates the last offset added to the page. * 'last' pointer indicates the last offset added to the page.
*----------
*/ */
static void static void
_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
{ {
Buffer nbuf; Buffer nbuf;
Page npage; Page npage;
...@@ -321,44 +348,34 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) ...@@ -321,44 +348,34 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
btisz, btisz,
(PageGetPageSize(npage) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) /3 - sizeof(ItemIdData)); (PageGetPageSize(npage) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) /3 - sizeof(ItemIdData));
if (pgspc < btisz) if (pgspc < btisz || pgspc < state->btps_full)
{ {
/* /*
* Item won't fit on this page, so finish off the page and * Item won't fit on this page, or we feel the page is full enough
* write it out. * already. Finish off the page and write it out.
*/ */
Buffer obuf = nbuf; Buffer obuf = nbuf;
Page opage = npage; Page opage = npage;
ItemId ii; ItemId ii;
ItemId hii; ItemId hii;
BTItem nbti; BTItem obti;
_bt_blnewpage(index, &nbuf, &npage, flags); /* Create new page */
_bt_blnewpage(index, &nbuf, &npage,
(state->btps_level > 0) ? 0 : BTP_LEAF);
/* /*
* We copy the last item on the page into the new page, and then * We copy the last item on the page into the new page, and then
* rearrange the old page so that the 'last item' becomes its high * rearrange the old page so that the 'last item' becomes its high
* key rather than a true data item. * key rather than a true data item. There had better be at least
* * two items on the page already, else the page would be empty of
* note that since we always copy an item to the new page, * useful data. (Hence, we must allow pages to be packed at least
* 'bti' will never be the first data item on the new page. * 2/3rds full; the 70% figure used above is close to minimum.)
*/ */
Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off); ii = PageGetItemId(opage, last_off);
if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, obti = (BTItem) PageGetItem(opage, ii);
P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) _bt_sortaddtup(npage, ItemIdGetLength(ii), obti, P_FIRSTKEY);
elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
#ifdef FASTBUILD_DEBUG
{
bool isnull;
BTItem tmpbti =
(BTItem) PageGetItem(npage, PageGetItemId(npage, P_FIRSTKEY));
Datum d = index_getattr(&(tmpbti->bti_itup), 1,
index->rd_att, &isnull);
printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
d, P_FIRSTKEY, state->btps_level);
}
#endif
/* /*
* Move 'last' into the high key position on opage * Move 'last' into the high key position on opage
...@@ -369,12 +386,39 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) ...@@ -369,12 +386,39 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
/* /*
* Reset last_off to point to new page * Link the old buffer into its parent, using its minimum key.
* If we don't have a parent, we have to create one;
* this adds a new btree level.
*/ */
last_off = PageGetMaxOffsetNumber(npage); if (state->btps_next == (BTPageState *) NULL)
{
state->btps_next =
_bt_pagestate(index, 0, state->btps_level + 1);
}
Assert(state->btps_minkey != NULL);
ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
BufferGetBlockNumber(obuf), P_HIKEY);
_bt_buildadd(index, state->btps_next, state->btps_minkey);
pfree((void *) state->btps_minkey);
/* /*
* set the page (side link) pointers. * Save a copy of the minimum key for the new page. We have to
* copy it off the old page, not the new one, in case we are
* not at leaf level.
*/
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
/*
* Set the sibling links for both pages, and parent links too.
*
* It's not necessary to set the parent link at all, because it's
* only used for handling concurrent root splits, but we may as well
* do it as a debugging aid. Note we set new page's link as well
* as old's, because if the new page turns out to be the last of
* the level, _bt_uppershutdown won't change it. The links may be
* out of date by the time the build finishes, but that's OK; they
* need only point to a left-sibling of the true parent. See the
* README file for more info.
*/ */
{ {
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
...@@ -383,46 +427,40 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) ...@@ -383,46 +427,40 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
oopaque->btpo_next = BufferGetBlockNumber(nbuf); oopaque->btpo_next = BufferGetBlockNumber(nbuf);
nopaque->btpo_prev = BufferGetBlockNumber(obuf); nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE; nopaque->btpo_next = P_NONE;
oopaque->btpo_parent = nopaque->btpo_parent =
BufferGetBlockNumber(state->btps_next->btps_buf);
} }
/* /*
* Link the old buffer into its parent, using its minimum key. * Write out the old page. We never want to see it again, so we
* If we don't have a parent, we have to create one; * can give up our lock (if we had one; most likely BuildingBtree
* this adds a new btree level. * is set, so we aren't locking).
*/ */
if (state->btps_next == (BTPageState *) NULL) _bt_wrtbuf(index, obuf);
{
state->btps_next =
_bt_pagestate(index, 0, state->btps_level + 1);
}
nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
_bt_buildadd(index, state->btps_next, nbti, 0);
pfree((void *) nbti);
/* /*
* write out the old stuff. we never want to see it again, so we * Reset last_off to point to new page
* can give up our lock (if we had one; BuildingBtree is set, so
* we aren't locking).
*/ */
_bt_wrtbuf(index, obuf); last_off = P_FIRSTKEY;
} }
/* /*
* Add the new item into the current page. * If the new item is the first for its page, stash a copy for later.
* Note this will only happen for the first item on a level; on later
* pages, the first item for a page is copied from the prior page
* in the code above.
*/ */
last_off = OffsetNumberNext(last_off); if (last_off == P_HIKEY)
if (PageAddItem(npage, (Item) bti, btisz,
last_off, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
#ifdef FASTBUILD_DEBUG
{ {
bool isnull; Assert(state->btps_minkey == NULL);
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); state->btps_minkey = _bt_formitem(&(bti->bti_itup));
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
d, last_off, state->btps_level);
} }
#endif
/*
* Add the new item into the current page.
*/
last_off = OffsetNumberNext(last_off);
_bt_sortaddtup(npage, btisz, bti, last_off);
state->btps_buf = nbuf; state->btps_buf = nbuf;
state->btps_page = npage; state->btps_page = npage;
...@@ -436,15 +474,15 @@ static void ...@@ -436,15 +474,15 @@ static void
_bt_uppershutdown(Relation index, BTPageState *state) _bt_uppershutdown(Relation index, BTPageState *state)
{ {
BTPageState *s; BTPageState *s;
BlockNumber blkno;
BTPageOpaque opaque;
BTItem bti;
/* /*
* Each iteration of this loop completes one more level of the tree. * Each iteration of this loop completes one more level of the tree.
*/ */
for (s = state; s != (BTPageState *) NULL; s = s->btps_next) for (s = state; s != (BTPageState *) NULL; s = s->btps_next)
{ {
BlockNumber blkno;
BTPageOpaque opaque;
blkno = BufferGetBlockNumber(s->btps_buf); blkno = BufferGetBlockNumber(s->btps_buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
...@@ -463,9 +501,12 @@ _bt_uppershutdown(Relation index, BTPageState *state) ...@@ -463,9 +501,12 @@ _bt_uppershutdown(Relation index, BTPageState *state)
} }
else else
{ {
bti = _bt_minitem(s->btps_page, blkno, 0); Assert(s->btps_minkey != NULL);
_bt_buildadd(index, s->btps_next, bti, 0); ItemPointerSet(&(s->btps_minkey->bti_itup.t_tid),
pfree((void *) bti); blkno, P_HIKEY);
_bt_buildadd(index, s->btps_next, s->btps_minkey);
pfree((void *) s->btps_minkey);
s->btps_minkey = NULL;
} }
/* /*
...@@ -500,11 +541,13 @@ _bt_load(Relation index, BTSpool *btspool) ...@@ -500,11 +541,13 @@ _bt_load(Relation index, BTSpool *btspool)
if (state == NULL) if (state == NULL)
state = _bt_pagestate(index, BTP_LEAF, 0); state = _bt_pagestate(index, BTP_LEAF, 0);
_bt_buildadd(index, state, bti, BTP_LEAF); _bt_buildadd(index, state, bti);
if (should_free) if (should_free)
pfree((void *) bti); pfree((void *) bti);
} }
/* Close down final pages, if we had any data at all */
if (state != NULL) if (state != NULL)
_bt_uppershutdown(index, state); _bt_uppershutdown(index, state);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment