Commit 5d9f146c authored by Marc G. Fournier's avatar Marc G. Fournier

What looks like some *major* improvements to btree indexing...

Patches from: aoki@CS.Berkeley.EDU (Paul M. Aoki)

i gave jolly my btree bulkload code a long, long time ago but never
gave him a bunch of my bugfixes.  here's a diff against the 6.0
baseline.

for some reason, this code has slowed down somewhat relative to the
insertion-build code on very small tables.  don't know why -- it used
to be within about 10%.  anyway, here are some (highly unscientific!)
timings on a dec 3000/300 for synthetic tables with 10k, 100k and
1000k tuples (basically, 1mb, 10mb and 100mb heaps).  'c' means
clustered (pre-sorted) inputs and 'u' means unclustered (randomly
ordered) inputs.  the 10k table basically fits in the buffer pool, but
the 100k and 1000k tables don't.  as you can see, insertion build is
fine if you've sorted your heaps on your index key or if your heap
fits in core, but is absolutely horrible on unordered data (yes,
that's 7.5 hours to index 100mb of data...) because of the zillions of
random i/os.

if it doesn't work for you for whatever reason, you can always turn it
back off by flipping the FastBuild flag in nbtree.c.  i don't have
time to maintain it.

good luck!

baseline code:

time psql -c 'create index c10 on k10 using btree (c int4_ops)' bttest
real   8.6
time psql -c 'create index u10 on k10 using btree (b int4_ops)' bttest
real   9.1
time psql -c 'create index c100 on k100 using btree (c int4_ops)' bttest
real   59.2
time psql -c 'create index u100 on k100 using btree (b int4_ops)' bttest
real   652.4
time psql -c 'create index c1000 on k1000 using btree (c int4_ops)' bttest
real   636.1
time psql -c 'create index u1000 on k1000 using btree (b int4_ops)' bttest
real   26772.9

bulkloading code:

time psql -c 'create index c10 on k10 using btree (c int4_ops)' bttest
real   11.3
time psql -c 'create index u10 on k10 using btree (b int4_ops)' bttest
real   10.4
time psql -c 'create index c100 on k100 using btree (c int4_ops)' bttest
real   59.5
time psql -c 'create index u100 on k100 using btree (b int4_ops)' bttest
real   63.5
time psql -c 'create index c1000 on k1000 using btree (c int4_ops)' bttest
real   636.9
time psql -c 'create index u1000 on k1000 using btree (b int4_ops)' bttest
real   701.0
parent d5a3f52d
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.12 1997/01/10 09:46:33 vadim Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.13 1997/02/12 05:04:17 scrappy Exp $
*
* NOTES
* This file contains only the public interface routines.
......@@ -33,8 +33,8 @@
# include <string.h>
#endif
bool BuildingBtree = false;
bool FastBuild = false; /* turn this on to make bulk builds work*/
bool BuildingBtree = false; /* see comment in btbuild() */
bool FastBuild = true; /* use sort/build instead of insertion build */
/*
* btbuild() -- build a new btree index.
......@@ -67,21 +67,34 @@ btbuild(Relation heap,
int i;
BTItem btitem;
#ifndef OMIT_PARTIAL_INDEX
ExprContext *econtext;
TupleTable tupleTable;
TupleTableSlot *slot;
ExprContext *econtext = (ExprContext *) NULL;
TupleTable tupleTable = (TupleTable) NULL;
TupleTableSlot *slot = (TupleTableSlot *) NULL;
#endif
Oid hrelid, irelid;
Node *pred, *oldPred;
void *spool;
void *spool = (void *) NULL;
bool isunique;
bool usefast;
#if 0
ResetBufferUsage();
#endif
/* note that this is a new btree */
BuildingBtree = true;
pred = predInfo->pred;
oldPred = predInfo->oldPred;
/*
* bootstrap processing does something strange, so don't use
* sort/build for initial catalog indices. at some point i need
* to look harder at this. (there is some kind of incremental
* processing going on there.) -- pma 08/29/95
*/
usefast = (FastBuild && IsNormalProcessingMode());
/* see if index is unique */
isunique = IndexIsUniqueNoCache(RelationGetRelationId(index));
......@@ -110,13 +123,16 @@ btbuild(Relation heap,
slot = ExecAllocTableSlot(tupleTable);
econtext = makeNode(ExprContext);
FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
/*
* we never want to use sort/build if we are extending an
* existing partial index -- it works by inserting the
* newly-qualifying tuples into the existing index.
* (sort/build would overwrite the existing index with one
* consisting of the newly-qualifying tuples.)
*/
usefast = false;
}
else
{
econtext = NULL;
tupleTable = NULL;
slot = NULL;
}
#endif /* OMIT_PARTIAL_INDEX */
/* start a heap scan */
......@@ -126,12 +142,10 @@ btbuild(Relation heap,
/* build the index */
nhtups = nitups = 0;
if (FastBuild) {
if (usefast) {
spool = _bt_spoolinit(index, 7);
res = (InsertIndexResult) NULL;
}
else
spool = NULL;
for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
......@@ -219,7 +233,7 @@ btbuild(Relation heap,
* into a spool page for subsequent processing. otherwise, we
* insert into the btree.
*/
if (FastBuild) {
if (usefast) {
_bt_spool(index, btitem, spool);
} else {
res = _bt_doinsert(index, btitem, isunique, heap);
......@@ -248,12 +262,24 @@ btbuild(Relation heap,
* merging the runs, (2) inserting the sorted tuples into btree
* pages and (3) building the upper levels.
*/
if (FastBuild) {
_bt_spool(index, (BTItem) NULL, spool); /* flush spool */
if (usefast) {
_bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
_bt_leafbuild(index, spool);
_bt_spooldestroy(spool);
}
#if 0
{
extern int ReadBufferCount, BufferHitCount, BufferFlushCount;
extern long NDirectFileRead, NDirectFileWrite;
printf("buffer(%d): r=%d w=%d\n", heap->rd_rel->relblocksz,
ReadBufferCount - BufferHitCount, BufferFlushCount);
printf("direct(%d): r=%d w=%d\n", LocalBlockSize,
NDirectFileRead, NDirectFileWrite);
}
#endif
/*
* Since we just counted the tuples in the heap, we update its
* stats in pg_class to guarantee that the planner takes advantage
......@@ -312,7 +338,10 @@ btinsert(Relation rel, Datum *datum, char *nulls, ItemPointer ht_ctid, Relation
pfree(btitem);
pfree(itup);
/* adjust any active scans that will be affected by this insertion */
_bt_adjscans(rel, &(res->pointerData), BT_INSERT);
return (res);
}
......@@ -533,7 +562,7 @@ void
btdelete(Relation rel, ItemPointer tid)
{
/* adjust any active scans that will be affected by this deletion */
_bt_adjscans(rel, tid);
_bt_adjscans(rel, tid, BT_DELETE);
/* delete the data from the page */
_bt_pagedel(rel, tid);
......
......@@ -5,7 +5,7 @@
*
*
* IDENTIFICATION
* $Id: nbtsort.c,v 1.8 1996/11/05 10:35:35 scrappy Exp $
* $Id: nbtsort.c,v 1.9 1997/02/12 05:04:20 scrappy Exp $
*
* NOTES
*
......@@ -20,15 +20,15 @@
* - when each input run has been exhausted, switch to another output
* tape and start processing another run.
* - when we have fewer runs than tapes, we know we are ready to start
* merging into the btree leaf pages.
* - every time we complete a level of the btree, we can construct the
* next level up. when we have only one page on a level, it can be
* attached to the btree metapage and we are done.
* merging into the btree leaf pages. (i.e., we do not have to wait
* until we have exactly one tape.)
* - as we extract tuples from the final runs, we build the pages for
* each level. when we have only one page on a level, it must be the
* root -- it can be attached to the btree metapage and we are done.
*
* conventions:
* - external interface routines take in and return "void *" for their
* opaque handles. this is for modularity reasons (i prefer not to
* export these structures without good reason).
* opaque handles. this is for modularity reasons.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
......@@ -63,12 +63,23 @@
# include <string.h>
#endif
#ifdef FASTBUILD
/*
* turn on debugging output.
*
* XXX this code just does a numeric printf of the index key, so it's
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
#define FASTBUILD_SPOOL
#define FASTBUILD_MERGE
#define MAXTAPES (7)
#define TAPEBLCKSZ (BLCKSZ << 2)
#define TAPEBLCKSZ (MAXBLCKSZ << 2)
#define TAPETEMP "pg_btsortXXXXXX"
extern int NDirectFileRead;
extern int NDirectFileWrite;
extern char *mktemp(char *template);
/*-------------------------------------------------------------------------
* sorting comparison routine - returns {-1,0,1} depending on whether
......@@ -88,6 +99,11 @@
* what the heck.
* *-------------------------------------------------------------------------
*/
typedef struct {
Datum btsk_datum;
BTItem btsk_item;
} BTSortKey;
static Relation _bt_sortrel;
static void
......@@ -97,28 +113,41 @@ _bt_isortcmpinit(Relation index)
}
static int
_bt_isortcmp(const void *bti1p,const void *bti2p)
_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
{
BTItem bti1 = *(BTItem *)bti1p;
BTItem bti2 = *(BTItem *)bti2p;
if (bti1 == (BTItem) NULL) {
if (bti2 == (BTItem) NULL) {
if (k1->btsk_item == (BTItem) NULL) {
if (k2->btsk_item == (BTItem) NULL) {
return(0); /* 1 = 2 */
}
return(1); /* 1 > 2 */
} else if (bti2 == (BTItem) NULL) {
} else if (k2->btsk_item == (BTItem) NULL) {
return(-1); /* 1 < 2 */
} else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2,
BTGreaterStrategyNumber)) {
} else if (_bt_invokestrat(_bt_sortrel, 1, BTGreaterStrategyNumber,
k1->btsk_datum, k2->btsk_datum)) {
return(1); /* 1 > 2 */
} else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1,
BTGreaterStrategyNumber)) {
} else if (_bt_invokestrat(_bt_sortrel, 1, BTGreaterStrategyNumber,
k2->btsk_datum, k1->btsk_datum)) {
return(-1); /* 1 < 2 */
}
return(0); /* 1 = 2 */
}
static void
_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
{
sk->btsk_item = (BTItem) NULL;
sk->btsk_datum = (Datum) NULL;
if (bti != (BTItem) NULL) {
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
if (!isnull) {
sk->btsk_item = bti;
sk->btsk_datum = d;
}
}
}
/*-------------------------------------------------------------------------
* priority queue methods
*
......@@ -133,7 +162,7 @@ _bt_isortcmp(const void *bti1p,const void *bti2p)
typedef struct {
int btpqe_tape; /* tape identifier */
BTItem btpqe_item; /* pointer to BTItem in tape buffer */
BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */
} BTPriQueueElem;
#define MAXELEM MAXTAPES
......@@ -336,7 +365,8 @@ static void
_bt_tapewrite(BTTapeBlock *tape, int eor)
{
tape->bttb_eor = eor;
FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ);
FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
NDirectFileWrite += TAPEBLCKSZ;
_bt_tapereset(tape);
}
......@@ -356,7 +386,7 @@ _bt_taperead(BTTapeBlock *tape)
int nread;
if (tape->bttb_eor) {
return(0); /* we are at End-Of-Run */
return(0); /* we are already at End-Of-Run */
}
/*
......@@ -364,7 +394,7 @@ _bt_taperead(BTTapeBlock *tape)
* VFD (the one in the block we're reading is bogus).
*/
fd = tape->bttb_fd;
nread = FileRead(fd, (char*) tape, TAPEBLCKSZ);
nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
tape->bttb_fd = fd;
if (nread != TAPEBLCKSZ) {
......@@ -372,6 +402,7 @@ _bt_taperead(BTTapeBlock *tape)
return(0);
}
Assert(tape->bttb_magic == BTTAPEMAGIC);
NDirectFileRead += TAPEBLCKSZ;
return(1);
}
......@@ -445,8 +476,6 @@ typedef struct {
void *
_bt_spoolinit(Relation index, int ntapes)
{
char *mktemp();
BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
int i;
char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
......@@ -567,6 +596,7 @@ _bt_spool(Relation index, BTItem btitem, void *spool)
BTSpool *btspool = (BTSpool *) spool;
BTTapeBlock *itape;
Size itemsz;
int i;
itape = btspool->bts_itape[btspool->bts_tape];
itemsz = BTITEMSZ(btitem);
......@@ -579,7 +609,7 @@ _bt_spool(Relation index, BTItem btitem, void *spool)
* buffer.
*/
if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) {
BTItem *parray;
BTSortKey *parray = (BTSortKey *) NULL;
BTTapeBlock *otape;
BTItem bti;
char *pos;
......@@ -590,45 +620,49 @@ _bt_spool(Relation index, BTItem btitem, void *spool)
* build an array of pointers to the BTItemDatas on the input
* block.
*/
parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem));
if (parray == (BTItem *) NULL) {
elog(WARN, "_bt_spool: out of memory");
}
pos = itape->bttb_data;
for (i = 0; i < itape->bttb_ntup; ++i) {
parray[i] = _bt_tapenext(itape, &pos);
if (itape->bttb_ntup > 0) {
parray =
(BTSortKey *) palloc(itape->bttb_ntup * sizeof(BTSortKey));
if (parray == (BTSortKey *) NULL) {
elog(WARN, "_bt_spool: out of memory");
}
pos = itape->bttb_data;
for (i = 0; i < itape->bttb_ntup; ++i) {
_bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
}
/*
* qsort the pointer array.
*/
_bt_isortcmpinit(index);
qsort((void *) parray, itape->bttb_ntup, sizeof(BTSortKey),
_bt_isortcmp);
}
/*
* qsort the pointer array.
*/
_bt_isortcmpinit(index);
qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp);
/*
* write the spooled run into the output tape. we copy the
* BTItemDatas in the order dictated by the sorted array of
* BTItems, not the original order.
*
* (since everything was DOUBLEALIGN'd and is all on a single
* page, everything had *better* still fit on one page..)
* tape block, everything had *better* still fit on one tape
* block..)
*/
otape = btspool->bts_otape[btspool->bts_tape];
for (i = 0; i < itape->bttb_ntup; ++i) {
bti = parray[i];
bti = parray[i].btsk_item;
btisz = BTITEMSZ(bti);
btisz = DOUBLEALIGN(btisz);
_bt_tapeadd(otape, bti, btisz);
#ifdef FASTBUILD_DEBUG
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
RelationGetTupleDescriptor(index),
&isnull);
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
&isnull);
printf("_bt_spool: inserted <%x> into output tape %d\n",
d, btspool->bts_tape);
}
#endif /* FASTBUILD_DEBUG */
#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
}
/*
......@@ -653,7 +687,9 @@ _bt_spool(Relation index, BTItem btitem, void *spool)
/*
* destroy the pointer array.
*/
pfree((void *) parray);
if (parray != (BTSortKey *) NULL) {
pfree((void *) parray);
}
}
/* insert this item into the current buffer */
......@@ -671,6 +707,9 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
BTPageOpaque opaque;
*buf = _bt_getbuf(index, P_NEW, BT_WRITE);
#if 0
printf("\tblk=%d\n", BufferGetBlockNumber(*buf));
#endif
*page = BufferGetPage(*buf);
_bt_pageinit(*page, BufferGetPageSize(*buf));
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
......@@ -680,8 +719,9 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
/*
* slide an array of ItemIds back one slot (from P_FIRSTKEY to
* P_HIKEY). we need to do this when we discover that we have built
* an ItemId array in what has turned out to be a P_RIGHTMOST page.
* P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
* that we have built an ItemId array in what has turned out to be a
* P_RIGHTMOST page.
*/
static void
_bt_slideleft(Relation index, Buffer buf, Page page)
......@@ -691,24 +731,71 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
ItemId previi;
ItemId thisii;
maxoff = PageGetMaxOffsetNumber(page);
previi = PageGetItemId(page, P_HIKEY);
for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
thisii = PageGetItemId(page, off);
*previi = *thisii;
previi = thisii;
if (!PageIsEmpty(page)) {
maxoff = PageGetMaxOffsetNumber(page);
previi = PageGetItemId(page, P_HIKEY);
for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
thisii = PageGetItemId(page, off);
*previi = *thisii;
previi = thisii;
}
((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
}
((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
}
typedef struct {
typedef struct BTPageState {
Buffer btps_buf;
Page btps_page;
BTItem btps_lastbti;
OffsetNumber btps_lastoff;
OffsetNumber btps_firstoff;
int btps_level;
bool btps_doupper;
struct BTPageState *btps_next;
} BTPageState;
/*
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
void *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
(void) memset((char *) state, 0, sizeof(BTPageState));
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
state->btps_firstoff = InvalidOffsetNumber;
state->btps_lastoff = P_HIKEY;
state->btps_lastbti = (BTItem) NULL;
state->btps_next = (BTPageState *) NULL;
state->btps_level = level;
state->btps_doupper = doupper;
return((void *) state);
}
/*
* return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on
* 'opage'. the copy is modified to point to 'opage' (as opposed to
* the page to which the item used to point, e.g., a heap page if
* 'opage' is a leaf page).
*/
BTItem
_bt_minitem(Page opage, BlockNumber oblkno, int atend)
{
OffsetNumber off;
BTItem obti;
BTItem nbti;
off = atend ? P_HIKEY : P_FIRSTKEY;
obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off));
nbti = _bt_formitem(&(obti->bti_itup));
ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY);
return(nbti);
}
/*
* add an item to a disk page from a merge tape block.
*
......@@ -748,11 +835,13 @@ typedef struct {
*
* if all keys are unique, 'first' will always be the same as 'last'.
*/
static void
_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
BTItem
_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
{
BTPageState *state = (BTPageState *) pstate;
Buffer nbuf;
Page npage;
char *pos;
BTItem last_bti;
OffsetNumber first_off;
OffsetNumber last_off;
......@@ -804,19 +893,26 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
ii = PageGetItemId(opage, o);
(void) PageAddItem(npage, PageGetItem(opage, ii),
ii->lp_len, n, LP_USED);
#ifdef FASTBUILD_DEBUG
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
BTItem tmpbti =
(BTItem) PageGetItem(npage, PageGetItemId(npage, n));
Datum d = index_getattr(&(tmpbti->bti_itup), 1,
RelationGetTupleDescriptor(index),
&isnull);
printf("_bt_buildadd: moved <%x> to offset %d\n",
d, n);
index->rd_att, &isnull);
printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
d, n, state->btps_level);
}
#endif /* FASTBUILD_DEBUG */
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
}
/*
* this loop is backward because PageIndexTupleDelete shuffles
* the tuples to fill holes in the page -- by starting at the
* end and working back, we won't create holes (and thereby
* avoid shuffling).
*/
for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) {
PageIndexTupleDelete(opage, o);
}
......@@ -842,6 +938,23 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
nopaque->btpo_next = P_NONE;
}
/*
* copy the old buffer's minimum key to its parent. if we
* don't have a parent, we have to create one; this adds a new
* btree level.
*/
if (state->btps_doupper) {
BTItem nbti;
if (state->btps_next == (BTPageState *) NULL) {
state->btps_next =
_bt_pagestate(index, 0, state->btps_level + 1, true);
}
nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
(void) _bt_buildadd(index, state->btps_next, nbti, 0);
pfree((void *) nbti);
}
/*
* write out the old stuff. we never want to see it again, so
* we can give up our lock (if we had one; BuildingBtree is
......@@ -856,16 +969,16 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
*/
off = OffsetNumberNext(last_off);
(void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED);
#ifdef FASTBUILD_DEBUG
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
RelationGetTupleDescriptor(index),
&isnull);
printf("_bt_buildadd: inserted <%x> at offset %d\n",
d, off);
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
d, off, state->btps_level);
}
#endif /* FASTBUILD_DEBUG */
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
if (last_bti == (BTItem) NULL) {
first_off = P_FIRSTKEY;
} else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) {
......@@ -879,6 +992,48 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
state->btps_lastbti = last_bti;
state->btps_lastoff = last_off;
state->btps_firstoff = first_off;
return(last_bti);
}
void
_bt_uppershutdown(Relation index, BTPageState *state)
{
BTPageState *s;
BlockNumber blkno;
BTPageOpaque opaque;
BTItem bti;
for (s = state; s != (BTPageState *) NULL; s = s->btps_next) {
blkno = BufferGetBlockNumber(s->btps_buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
/*
* if this is the root, attach it to the metapage. otherwise,
* stick the minimum key of the last page on this level (which
* has not been split, or else it wouldn't be the last page)
* into its parent. this may cause the last page of upper
* levels to split, but that's not a problem -- we haven't
* gotten to them yet.
*/
if (s->btps_doupper) {
if (s->btps_next == (BTPageState *) NULL) {
opaque->btpo_flags |= BTP_ROOT;
_bt_metaproot(index, blkno);
} else {
bti = _bt_minitem(s->btps_page, blkno, 0);
(void) _bt_buildadd(index, s->btps_next, bti, 0);
pfree((void *) bti);
}
}
/*
* this is the rightmost page, so the ItemId array needs to be
* slid back one slot.
*/
_bt_slideleft(index, s->btps_buf, s->btps_page);
_bt_wrtbuf(index, s->btps_buf);
}
}
/*
......@@ -888,11 +1043,10 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
*
* XXX three nested loops? gross. cut me up into smaller routines.
*/
static BlockNumber
static void
_bt_merge(Relation index, BTSpool *btspool)
{
BTPageState state;
BlockNumber firstblk;
BTPageState *state;
BTPriQueue q;
BTPriQueueElem e;
BTItem bti;
......@@ -902,29 +1056,31 @@ _bt_merge(Relation index, BTSpool *btspool)
int tapedone[MAXTAPES];
int t;
int goodtapes;
int npass;
int nruns;
Size btisz;
bool doleaf = false;
BTPageState *s;
BTPageOpaque *opaque;
/*
* initialize state needed for the merge into the btree leaf pages.
*/
(void) memset((char *) &state, 0, sizeof(BTPageState));
_bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF);
state.btps_lastoff = P_HIKEY;
state.btps_lastbti = (BTItem) NULL;
firstblk = BufferGetBlockNumber(state.btps_buf);
state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
npass = 0;
do { /* pass */
/*
* each pass starts by flushing the previous outputs and
* swapping inputs and outputs. this process also clears the
* new output tapes and rewinds the new input tapes.
* swapping inputs and outputs. flushing sets End-of-Run for
* any dirty output tapes. swapping clears the new output
* tapes and rewinds the new input tapes.
*/
btspool->bts_tape = btspool->bts_ntapes - 1;
_bt_spoolflush(btspool);
_bt_spoolswap(btspool);
++npass;
nruns = 0;
for (;;) { /* run */
......@@ -949,22 +1105,27 @@ _bt_merge(Relation index, BTSpool *btspool)
for (t = 0; t < btspool->bts_ntapes; ++t) {
itape = btspool->bts_itape[t];
tapepos[t] = itape->bttb_data;
tapedone[t] = 0;
_bt_tapereset(itape);
if (_bt_taperead(itape) == 0) {
tapedone[t] = 1;
} else {
do {
if (_bt_taperead(itape) == 0) {
tapedone[t] = 1;
}
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t]) {
++goodtapes;
tapedone[t] = 0;
e.btpqe_tape = t;
e.btpqe_item = _bt_tapenext(itape, &tapepos[t]);
if (e.btpqe_item != (BTItem) NULL) {
_bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
&(e.btpqe_item));
if (e.btpqe_item.btsk_item != (BTItem) NULL) {
_bt_pqadd(&q, &e);
}
}
}
/*
* if we don't have any tapes with any input (i.e., they
* are all at EOF), we must be done with this pass.
* are all at EOF), there is no work to do in this run --
* we must be done with this pass.
*/
if (goodtapes == 0) {
break; /* for */
......@@ -972,8 +1133,8 @@ _bt_merge(Relation index, BTSpool *btspool)
++nruns;
/*
* output the smallest element from the queue until there are no
* more.
* output the smallest element from the queue until there
* are no more.
*/
while (_bt_pqnext(&q, &e) >= 0) { /* item */
/*
......@@ -982,63 +1143,59 @@ _bt_merge(Relation index, BTSpool *btspool)
* if it hits either End-Of-Run or EOF.
*/
t = e.btpqe_tape;
bti = e.btpqe_item;
bti = e.btpqe_item.btsk_item;
if (bti != (BTItem) NULL) {
btisz = BTITEMSZ(bti);
btisz = DOUBLEALIGN(btisz);
if (doleaf) {
_bt_buildadd(index, &state, bti, BTP_LEAF);
#ifdef FASTBUILD_DEBUG
(void) _bt_buildadd(index, state, bti, BTP_LEAF);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
RelationGetTupleDescriptor(index),
&isnull);
printf("_bt_merge: inserted <%x> into block %d\n",
d, BufferGetBlockNumber(state.btps_buf));
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
npass, nruns, d, t,
BufferGetBlockNumber(state->btps_buf));
}
#endif /* FASTBUILD_DEBUG */
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
} else {
if (SPCLEFT(otape) < btisz) {
/*
* if it's full, write it out and add the
* item to the next block. (since we know
* there will be at least one more block,
* we know we do *not* want to set
* End-Of-Run here!)
* item to the next block. (since we will
* be adding another tuple immediately
* after this, we can be sure that there
* will be at least one more block in this
* run and so we know we do *not* want to
* set End-Of-Run here.)
*/
_bt_tapewrite(otape, 0);
}
_bt_tapeadd(otape, bti, btisz);
#ifdef FASTBUILD_DEBUG
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
RelationGetTupleDescriptor(index), &isnull);
printf("_bt_merge: inserted <%x> into tape %d\n",
d, btspool->bts_tape);
index->rd_att, &isnull);
printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
npass, nruns, d, t,
btspool->bts_tape);
}
#endif /* FASTBUILD_DEBUG */
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
}
}
#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1,
RelationGetTupleDescriptor(index),
&isnull);
printf("_bt_merge: got <%x> from tape %d\n", d, t);
}
#endif /* FASTBUILD_DEBUG */
itape = btspool->bts_itape[t];
if (!tapedone[t]) {
BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
if (newbti == (BTItem) NULL) {
if (_bt_taperead(itape) == 0) {
tapedone[t] = 1;
} else {
do {
if (_bt_taperead(itape) == 0) {
tapedone[t] = 1;
}
} while (!tapedone[t] && EMPTYTAPE(itape));
if (!tapedone[t]) {
tapepos[t] = itape->bttb_data;
newbti = _bt_tapenext(itape, &tapepos[t]);
}
......@@ -1047,11 +1204,17 @@ _bt_merge(Relation index, BTSpool *btspool)
BTPriQueueElem nexte;
nexte.btpqe_tape = t;
nexte.btpqe_item = newbti;
_bt_setsortkey(index, newbti, &(nexte.btpqe_item));
_bt_pqadd(&q, &nexte);
}
}
} /* item */
/*
* that's it for this run. flush the output tape, marking
* End-of-Run.
*/
_bt_tapewrite(otape, 1);
} /* run */
/*
......@@ -1068,60 +1231,50 @@ _bt_merge(Relation index, BTSpool *btspool)
}
} while (nruns > 0); /* pass */
/*
* this is the rightmost page, so the ItemId array needs to be
* slid back one slot.
*/
_bt_slideleft(index, state.btps_buf, state.btps_page);
_bt_wrtbuf(index, state.btps_buf);
return(firstblk);
_bt_uppershutdown(index, state);
}
/*
* given the block number 'blk' of the first page of a set of linked
* siblings (i.e., the start of an entire level of the btree),
* construct the corresponding next level of the btree. we do this by
* placing minimum keys from each page into this page. the format of
* the internal pages is otherwise the same as for leaf pages.
* given the (appropriately side-linked) leaf pages of a btree,
* construct the corresponding upper levels. we do this by inserting
* minimum keys from each page into parent pages as needed. the
* format of the internal pages is otherwise the same as for leaf
* pages.
*
* this routine is not called during conventional bulk-loading (in
* which case we can just build the upper levels as we create the
* sorted bottom level). it is only used for index recycling.
*/
void
_bt_upperbuild(Relation index, BlockNumber blk, int level)
_bt_upperbuild(Relation index)
{
Buffer rbuf;
BlockNumber blk;
Page rpage;
BTPageOpaque ropaque;
BTPageState state;
BlockNumber firstblk;
BTItem bti;
BTPageState *state;
BTItem nbti;
OffsetNumber off;
rbuf = _bt_getbuf(index, blk, BT_WRITE);
rpage = BufferGetPage(rbuf);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
/*
* if we only have one page on a level, we can just make it the
* root.
* find the first leaf block. while we're at it, clear the
* BTP_ROOT flag that we set while building it (so we could find
* it later).
*/
if (P_RIGHTMOST(ropaque)) {
ropaque->btpo_flags |= BTP_ROOT;
_bt_wrtbuf(index, rbuf);
_bt_metaproot(index, blk);
return;
}
_bt_relbuf(index, rbuf, BT_WRITE);
rbuf = _bt_getroot(index, BT_WRITE);
blk = BufferGetBlockNumber(rbuf);
rpage = BufferGetPage(rbuf);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
ropaque->btpo_flags &= ~BTP_ROOT;
_bt_wrtbuf(index, rbuf);
(void) memset((char *) &state, 0, sizeof(BTPageState));
_bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
state.btps_lastoff = P_HIKEY;
state.btps_lastbti = (BTItem) NULL;
firstblk = BufferGetBlockNumber(state.btps_buf);
state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
/* for each page... */
do {
#if 0
printf("\t\tblk=%d\n", blk);
#endif
rbuf = _bt_getbuf(index, blk, BT_READ);
rpage = BufferGetPage(rbuf);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
......@@ -1133,35 +1286,24 @@ _bt_upperbuild(Relation index, BlockNumber blk, int level)
* of the lower page and insert it into a page at this
* level.
*/
off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
nbti = _bt_formitem(&(bti->bti_itup));
ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
#ifdef FASTBUILD_DEBUG
nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
{
bool isnull;
Datum d = index_getattr(&(nbti->bti_itup), 1,
RelationGetTupleDescriptor(index),
Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
&isnull);
printf("_bt_upperbuild: inserting <%x> at %d\n",
d, level);
d, state->btps_level);
}
#endif /* FASTBUILD_DEBUG */
_bt_buildadd(index, &state, nbti, 0);
#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
(void) _bt_buildadd(index, state, nbti, 0);
pfree((void *) nbti);
}
blk = ropaque->btpo_next;
_bt_relbuf(index, rbuf, BT_READ);
} while (blk != P_NONE);
/*
* this is the rightmost page, so the ItemId array needs to be
* slid back one slot.
*/
_bt_slideleft(index, state.btps_buf, state.btps_page);
_bt_wrtbuf(index, state.btps_buf);
_bt_upperbuild(index, firstblk, level + 1);
_bt_uppershutdown(index, state);
}
/*
......@@ -1171,26 +1313,5 @@ _bt_upperbuild(Relation index, BlockNumber blk, int level)
void
_bt_leafbuild(Relation index, void *spool)
{
BTSpool *btspool = (BTSpool *) spool;
BlockNumber firstblk;
/*
* merge the runs into btree leaf pages.
*/
firstblk = _bt_merge(index, btspool);
/*
* build the upper levels of the btree.
*/
_bt_upperbuild(index, firstblk, 0);
_bt_merge(index, (BTSpool *) spool);
}
#else /* !FASTBUILD */
void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
void _bt_spooldestroy(void *spool) { }
void _bt_spool(Relation index, BTItem btitem, void *spool) { }
void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
void _bt_leafbuild(Relation index, void *spool) { }
#endif /* !FASTBUILD */
......@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: nbtree.h,v 1.5 1997/01/10 09:36:33 vadim Exp $
* $Id: nbtree.h,v 1.6 1997/02/12 05:04:28 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -131,6 +131,13 @@ typedef BTStackData *BTStack;
#define BT_INSERTION 0
#define BT_DESCENT 1
/*
* We must classify index modification types for the benefit of
* _bt_adjscans.
*/
#define BT_INSERT 0
#define BT_DELETE 1
/*
* In general, the btree code tries to localize its knowledge about
* page layout to a couple of routines. However, we need a special
......@@ -220,11 +227,7 @@ extern void btdelete(Relation rel, ItemPointer tid);
*/
extern void _bt_regscan(IndexScanDesc scan);
extern void _bt_dropscan(IndexScanDesc scan);
extern void _bt_adjscans(Relation rel, ItemPointer tid);
extern void _bt_scandel(IndexScanDesc scan, BlockNumber blkno,
OffsetNumber offno);
extern bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno,
OffsetNumber offno);
extern void _bt_adjscans(Relation rel, ItemPointer tid, int op);
/*
* prototypes for functions in nbtsearch.c
......@@ -267,7 +270,7 @@ extern BTItem _bt_formitem(IndexTuple itup);
extern void *_bt_spoolinit(Relation index, int ntapes);
extern void _bt_spooldestroy(void *spool);
extern void _bt_spool(Relation index, BTItem btitem, void *spool);
extern void _bt_upperbuild(Relation index, BlockNumber blk, int level);
extern void _bt_upperbuild(Relation index);
extern void _bt_leafbuild(Relation index, void *spool);
#endif /* NBTREE_H */
# define USE_POSIX_TIME
# define NEED_I386_TAS_ASM
# define HAS_TEST_AND_SET
# if defined(__mips__)
/* # undef HAS_TEST_AND_SET */
# endif
typedef unsigned char slock_t;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment