Commit 9e85183b authored by Tom Lane

Major overhaul of btree index code.  Eliminate special BTP_CHAIN logic for
duplicate keys by letting search go to the left rather than right when an
equal key is seen at an upper tree level.  Fix poor choice of page split
point (leading to insertion failures) that was forced by chaining logic.
Don't store leftmost key in non-leaf pages, since it's not necessary.
Don't create root page until something is first stored in the index, so an
unused index is now 8K not 16K.  (Doesn't seem to be as easy to get rid of
the metadata page, unfortunately.)  Massive cleanup of unreadable code,
fix poor, obsolete, and just plain wrong documentation and comments.
See src/backend/access/nbtree/README for the gory details.
parent c9537ca8
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.61 2000/07/14 22:17:33 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.62 2000/07/21 06:42:32 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -26,6 +26,7 @@
#include "executor/executor.h"
#include "miscadmin.h"
bool BuildingBtree = false; /* see comment in btbuild() */
bool FastBuild = true; /* use sort/build instead of insertion
* build */
......@@ -206,8 +207,8 @@ btbuild(PG_FUNCTION_ARGS)
* btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE.
* Sure, it's just rule for placing/finding items and no more -
* keytest'll return FALSE for a = 5 for items having 'a' isNULL.
* Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for how it
* works. - vadim 03/23/97
* Look at _bt_compare for how it works.
* - vadim 03/23/97
*
* if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; }
*/
......@@ -321,14 +322,6 @@ btinsert(PG_FUNCTION_ARGS)
/* generate an index tuple */
itup = index_formtuple(RelationGetDescr(rel), datum, nulls);
itup->t_tid = *ht_ctid;
/*
* See comments in btbuild.
*
* if (itup->t_info & INDEX_NULL_MASK)
* PG_RETURN_POINTER((InsertIndexResult) NULL);
*/
btitem = _bt_formitem(itup);
res = _bt_doinsert(rel, btitem, rel->rd_uniqueindex, heapRel);
......@@ -357,10 +350,10 @@ btgettuple(PG_FUNCTION_ARGS)
if (ItemPointerIsValid(&(scan->currentItemData)))
{
/*
* Restore scan position using heap TID returned by previous call
* to btgettuple(). _bt_restscan() locks buffer.
* to btgettuple(). _bt_restscan() re-grabs the read lock on
* the buffer, too.
*/
_bt_restscan(scan);
res = _bt_next(scan, dir);
......@@ -369,8 +362,9 @@ btgettuple(PG_FUNCTION_ARGS)
res = _bt_first(scan, dir);
/*
* Save heap TID to use it in _bt_restscan. Unlock buffer before
* leaving index !
* Save heap TID to use it in _bt_restscan. Then release the read
* lock on the buffer so that we aren't blocking other backends.
* NOTE: we do keep the pin on the buffer!
*/
if (res)
{
......@@ -419,7 +413,18 @@ btrescan(PG_FUNCTION_ARGS)
so = (BTScanOpaque) scan->opaque;
/* we don't hold a read lock on the current page in the scan */
if (so == NULL) /* if called from btbeginscan */
{
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
so->keyData = (ScanKey) NULL;
if (scan->numberOfKeys > 0)
so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
scan->opaque = so;
scan->flags = 0x0;
}
/* we aren't holding any read locks, but gotta drop the pins */
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
{
ReleaseBuffer(so->btso_curbuf);
......@@ -427,7 +432,6 @@ btrescan(PG_FUNCTION_ARGS)
ItemPointerSetInvalid(iptr);
}
/* and we don't hold a read lock on the last marked item in the scan */
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
{
ReleaseBuffer(so->btso_mrkbuf);
......@@ -435,17 +439,6 @@ btrescan(PG_FUNCTION_ARGS)
ItemPointerSetInvalid(iptr);
}
if (so == NULL) /* if called from btbeginscan */
{
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
so->keyData = (ScanKey) NULL;
if (scan->numberOfKeys > 0)
so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
scan->opaque = so;
scan->flags = 0x0;
}
/*
* Reset the scan keys. Note that keys ordering stuff moved to
* _bt_first. - vadim 05/05/97
......@@ -472,7 +465,7 @@ btmovescan(IndexScanDesc scan, Datum v)
so = (BTScanOpaque) scan->opaque;
/* we don't hold a read lock on the current page in the scan */
/* we aren't holding any read locks, but gotta drop the pin */
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
{
ReleaseBuffer(so->btso_curbuf);
......@@ -480,7 +473,6 @@ btmovescan(IndexScanDesc scan, Datum v)
ItemPointerSetInvalid(iptr);
}
/* scan->keyData[0].sk_argument = v; */
so->keyData[0].sk_argument = v;
}
......@@ -496,7 +488,7 @@ btendscan(PG_FUNCTION_ARGS)
so = (BTScanOpaque) scan->opaque;
/* we don't hold any read locks */
/* we aren't holding any read locks, but gotta drop the pins */
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
{
if (BufferIsValid(so->btso_curbuf))
......@@ -534,7 +526,7 @@ btmarkpos(PG_FUNCTION_ARGS)
so = (BTScanOpaque) scan->opaque;
/* we don't hold any read locks */
/* we aren't holding any read locks, but gotta drop the pin */
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
{
ReleaseBuffer(so->btso_mrkbuf);
......@@ -542,7 +534,7 @@ btmarkpos(PG_FUNCTION_ARGS)
ItemPointerSetInvalid(iptr);
}
/* bump pin on current buffer */
/* bump pin on current buffer for assignment to mark buffer */
if (ItemPointerIsValid(&(scan->currentItemData)))
{
so->btso_mrkbuf = ReadBuffer(scan->relation,
......@@ -566,7 +558,7 @@ btrestrpos(PG_FUNCTION_ARGS)
so = (BTScanOpaque) scan->opaque;
/* we don't hold any read locks */
/* we aren't holding any read locks, but gotta drop the pin */
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
{
ReleaseBuffer(so->btso_curbuf);
......@@ -579,7 +571,6 @@ btrestrpos(PG_FUNCTION_ARGS)
{
so->btso_curbuf = ReadBuffer(scan->relation,
BufferGetBlockNumber(so->btso_mrkbuf));
scan->currentItemData = scan->currentMarkData;
so->curHeapIptr = so->mrkHeapIptr;
}
......@@ -603,6 +594,9 @@ btdelete(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
/*
* Restore scan position when btgettuple is called to continue a scan.
*/
static void
_bt_restscan(IndexScanDesc scan)
{
......@@ -618,7 +612,12 @@ _bt_restscan(IndexScanDesc scan)
BTItem item;
BlockNumber blkno;
LockBuffer(buf, BT_READ); /* lock buffer first! */
/*
* Get back the read lock we were holding on the buffer.
* (We still have a reference-count pin on it, though.)
*/
LockBuffer(buf, BT_READ);
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
......@@ -631,43 +630,40 @@ _bt_restscan(IndexScanDesc scan)
*/
if (!ItemPointerIsValid(&target))
{
ItemPointerSetOffsetNumber(&(scan->currentItemData),
OffsetNumberPrev(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY));
ItemPointerSetOffsetNumber(current,
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
return;
}
if (maxoff >= offnum)
{
/*
* if the item is where we left it or has just moved right on this
* page, we're done
* The item we were on may have moved right due to insertions.
* Find it again.
*/
for (;;)
{
/* Check for item on this page */
for (;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
if (item->bti_itup.t_tid.ip_blkid.bi_hi == \
target.ip_blkid.bi_hi && \
item->bti_itup.t_tid.ip_blkid.bi_lo == \
target.ip_blkid.bi_lo && \
if (item->bti_itup.t_tid.ip_blkid.bi_hi ==
target.ip_blkid.bi_hi &&
item->bti_itup.t_tid.ip_blkid.bi_lo ==
target.ip_blkid.bi_lo &&
item->bti_itup.t_tid.ip_posid == target.ip_posid)
{
current->ip_posid = offnum;
return;
}
}
}
/*
* By here, the item we're looking for moved right at least one page
*/
for (;;)
{
if (P_RIGHTMOST(opaque))
elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!\
\n\tRecreate index %s.", RelationGetRelationName(rel));
elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
"\n\tRecreate index %s.", RelationGetRelationName(rel));
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf, BT_READ);
......@@ -675,23 +671,8 @@ _bt_restscan(IndexScanDesc scan)
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* see if it's on this page */
for (offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
if (item->bti_itup.t_tid.ip_blkid.bi_hi == \
target.ip_blkid.bi_hi && \
item->bti_itup.t_tid.ip_blkid.bi_lo == \
target.ip_blkid.bi_lo && \
item->bti_itup.t_tid.ip_posid == target.ip_posid)
{
offnum = P_FIRSTDATAKEY(opaque);
ItemPointerSet(current, blkno, offnum);
so->btso_curbuf = buf;
return;
}
}
}
}
......@@ -8,22 +8,25 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.31 2000/04/12 17:14:49 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.32 2000/07/21 06:42:32 tgl Exp $
*
*
* NOTES
* Because we can be doing an index scan on a relation while we update
* it, we need to avoid missing data that moves around in the index.
* The routines and global variables in this file guarantee that all
* scans in the local address space stay correctly positioned. This
* is all we need to worry about, since write locking guarantees that
* no one else will be on the same page at the same time as we are.
* Insertions and page splits are no problem because _bt_restscan()
* can figure out where the current item moved to, but if a deletion
* happens at or before the current scan position, we'd better do
* something to stay in sync.
*
* The routines in this file handle the problem for deletions issued
* by the current backend. Currently, that's all we need, since
* deletions are only done by VACUUM and it gets an exclusive lock.
*
* The scheme is to manage a list of active scans in the current backend.
* Whenever we add or remove records from an index, or whenever we
* split a leaf page, we check the list of active scans to see if any
* has been affected. A scan is affected only if it is on the same
* relation, and the same page, as the update.
* Whenever we remove a record from an index, we check the list of active
* scans to see if any has been affected. A scan is affected only if it
* is on the same relation, and the same page, as the update.
*
*-------------------------------------------------------------------------
*/
......@@ -111,7 +114,7 @@ _bt_dropscan(IndexScanDesc scan)
/*
* _bt_adjscans() -- adjust all scans in the scan list to compensate
* for a given deletion or insertion
* for a given deletion
*/
void
_bt_adjscans(Relation rel, ItemPointer tid)
......@@ -153,7 +156,7 @@ _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
start = P_FIRSTDATAKEY(opaque);
if (ItemPointerGetOffsetNumber(current) == start)
ItemPointerSetInvalid(&(so->curHeapIptr));
else
......@@ -165,7 +168,6 @@ _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
*/
LockBuffer(buf, BT_READ);
_bt_step(scan, &buf, BackwardScanDirection);
so->btso_curbuf = buf;
if (ItemPointerIsValid(current))
{
Page pg = BufferGetPage(buf);
......@@ -183,10 +185,9 @@ _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno)
{
page = BufferGetPage(so->btso_mrkbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
start = P_FIRSTDATAKEY(opaque);
if (ItemPointerGetOffsetNumber(current) == start)
ItemPointerSetInvalid(&(so->mrkHeapIptr));
......
This diff is collapsed.
This diff is collapsed.
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.37 2000/05/30 04:24:33 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.38 2000/07/21 06:42:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -20,16 +20,13 @@
#include "access/nbtree.h"
#include "executor/execdebug.h"
extern int NIndexTupleProcessed;
/*
* _bt_mkscankey
* Build a scan key that contains comparison data from itup
* as well as comparator routines appropriate to the key datatypes.
*
* The result is intended for use with _bt_skeycmp() or _bt_compare(),
* although it could be used with _bt_itemcmp() or _bt_tuplecompare().
* The result is intended for use with _bt_compare().
*/
ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
......@@ -68,8 +65,9 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
* Build a scan key that contains comparator routines appropriate to
* the key datatypes, but no comparison data.
*
* The result can be used with _bt_itemcmp() or _bt_tuplecompare(),
* but not with _bt_skeycmp() or _bt_compare().
* The result cannot be used with _bt_compare(). Currently this
* routine is only called by utils/sort/tuplesort.c, which has its
* own comparison routine.
*/
ScanKey
_bt_mkscankey_nodata(Relation rel)
......@@ -114,7 +112,6 @@ _bt_freestack(BTStack stack)
{
ostack = stack;
stack = stack->bts_parent;
pfree(ostack->bts_btitem);
pfree(ostack);
}
}
......@@ -331,55 +328,16 @@ _bt_formitem(IndexTuple itup)
Size tuplen;
extern Oid newoid();
/*
* see comments in btbuild
*
* if (itup->t_info & INDEX_NULL_MASK) elog(ERROR, "btree indices cannot
* include null keys");
*/
/* make a copy of the index tuple with room for the sequence number */
tuplen = IndexTupleSize(itup);
nbytes_btitem = tuplen + (sizeof(BTItemData) - sizeof(IndexTupleData));
btitem = (BTItem) palloc(nbytes_btitem);
memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
memcpy((char *) &(btitem->bti_itup), (char *) itup, tuplen);
return btitem;
}
#ifdef NOT_USED
/*
 * _bt_checkqual
 *		Test an index tuple against every scan key held in the scan's
 *		btree-private state.
 *
 * Returns true when the scan carries no keys at all; otherwise returns
 * whatever index_keytest() reports for itup, using the tuple descriptor
 * of the scanned relation.
 *
 * Only compiled when NOT_USED is defined.
 */
bool
_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	BTScanOpaque opaque = (BTScanOpaque) scan->opaque;

	/* No keys means there is nothing to reject the tuple with. */
	if (opaque->numberOfKeys <= 0)
		return true;

	return index_keytest(itup,
						 RelationGetDescr(scan->relation),
						 opaque->numberOfKeys,
						 opaque->keyData);
}
#endif
#ifdef NOT_USED
/*
 * _bt_checkforkeys
 *		Test an index tuple against only the first keysz scan keys held in
 *		the scan's btree-private state.
 *
 * Returns true without testing anything when keysz is zero or when the
 * scan holds fewer than keysz keys; otherwise returns whatever
 * index_keytest() reports for itup against those keysz keys.
 *
 * Only compiled when NOT_USED is defined.
 */
bool
_bt_checkforkeys(IndexScanDesc scan, IndexTuple itup, Size keysz)
{
	BTScanOpaque opaque = (BTScanOpaque) scan->opaque;

	/* Nothing requested, or more keys requested than the scan has. */
	if (keysz <= 0 || opaque->numberOfKeys < keysz)
		return true;

	return index_keytest(itup,
						 RelationGetDescr(scan->relation),
						 keysz,
						 opaque->keyData);
}
#endif
bool
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, Size *keysok)
{
......
This diff is collapsed.
This diff is collapsed.
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: bufpage.h,v 1.30 2000/07/03 02:54:21 vadim Exp $
* $Id: bufpage.h,v 1.31 2000/07/21 06:42:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -309,7 +309,6 @@ extern Page PageGetTempPage(Page page, Size specialSize);
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
extern void PageRepairFragmentation(Page page);
extern Size PageGetFreeSpace(Page page);
extern void PageManagerModeSet(PageManagerMode mode);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment