Commit b4297c17 authored by Neil Conway

This patch makes some improvements to the rtree index implementation:

(1) Keep a pin on the scan's current buffer and mark buffer. This
avoids the need to do a ReadBuffer() for each tuple produced by the
scan. Since ReadBuffer() is expensive, this is a significant win.

(2) Convert a ReleaseBuffer(); ReadBuffer() pair into
ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock
acquire/release...

(3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle
both the "initial result" and "subsequent result" cases.

(4) Add support for index tuple killing

(5) Remove rtscancache(): it is dead code, for the same reason that
gistscancache() is dead code (an index scan ought not be invoked with
NoMovementScanDirection).

The end result is about a 10% improvement in rtree index scan performance,
according to contrib/rtree_gist/bench.
parent 1f5299bc
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.33 2004/12/31 21:59:26 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.34 2005/01/18 23:25:43 neilc Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -19,10 +19,8 @@ ...@@ -19,10 +19,8 @@
#include "access/relscan.h" #include "access/relscan.h"
#include "access/rtree.h" #include "access/rtree.h"
static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n, static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
ScanDirection dir); ScanDirection dir);
static bool rtscancache(IndexScanDesc s, ScanDirection dir);
static bool rtfirst(IndexScanDesc s, ScanDirection dir);
static bool rtnext(IndexScanDesc s, ScanDirection dir); static bool rtnext(IndexScanDesc s, ScanDirection dir);
...@@ -31,138 +29,106 @@ rtgettuple(PG_FUNCTION_ARGS) ...@@ -31,138 +29,106 @@ rtgettuple(PG_FUNCTION_ARGS)
{ {
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0); IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
bool res; Page page;
OffsetNumber offnum;
/* if we have it cached in the scan desc, just return the value */
if (rtscancache(s, dir))
PG_RETURN_BOOL(true);
/* not cached, so we'll have to do some work */
if (ItemPointerIsValid(&(s->currentItemData)))
res = rtnext(s, dir);
else
res = rtfirst(s, dir);
PG_RETURN_BOOL(res);
}
static bool
rtfirst(IndexScanDesc s, ScanDirection dir)
{
Buffer b;
Page p;
OffsetNumber n;
OffsetNumber maxoff;
RTreePageOpaque po;
RTreeScanOpaque so; RTreeScanOpaque so;
RTSTACK *stk;
BlockNumber blk;
IndexTuple it;
b = ReadBuffer(s->indexRelation, P_ROOT);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque; so = (RTreeScanOpaque) s->opaque;
for (;;) /*
* If we've already produced a tuple and the executor has informed
* us that it should be marked "killed", do so now.
*/
if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
{ {
maxoff = PageGetMaxOffsetNumber(p); offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
if (ScanDirectionIsBackward(dir)) page = BufferGetPage(so->curbuf);
n = findnext(s, p, maxoff, dir); PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
else SetBufferCommitInfoNeedsSave(so->curbuf);
n = findnext(s, p, FirstOffsetNumber, dir); }
while (n < FirstOffsetNumber || n > maxoff)
{
ReleaseBuffer(b);
if (so->s_stack == NULL)
return false;
stk = so->s_stack;
b = ReadBuffer(s->indexRelation, stk->rts_blk);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
maxoff = PageGetMaxOffsetNumber(p);
if (ScanDirectionIsBackward(dir)) /*
n = OffsetNumberPrev(stk->rts_child); * Get the next tuple that matches the search key; if asked to
else * skip killed tuples, find the first non-killed tuple that
n = OffsetNumberNext(stk->rts_child); * matches. Return as soon as we've run out of matches or we've
so->s_stack = stk->rts_parent; * found an acceptable match.
pfree(stk); */
for (;;)
{
bool res = rtnext(s, dir);
n = findnext(s, p, n, dir); if (res == true && s->ignore_killed_tuples)
}
if (po->flags & F_LEAF)
{ {
ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n); offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
page = BufferGetPage(so->curbuf);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); if (ItemIdDeleted(PageGetItemId(page, offnum)))
continue;
s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true;
} }
else
{
stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n;
stk->rts_blk = BufferGetBlockNumber(b);
stk->rts_parent = so->s_stack;
so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid));
ReleaseBuffer(b); PG_RETURN_BOOL(res);
b = ReadBuffer(s->indexRelation, blk);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
}
} }
} }
static bool static bool
rtnext(IndexScanDesc s, ScanDirection dir) rtnext(IndexScanDesc s, ScanDirection dir)
{ {
Buffer b;
Page p; Page p;
OffsetNumber n; OffsetNumber n;
OffsetNumber maxoff;
RTreePageOpaque po; RTreePageOpaque po;
RTreeScanOpaque so; RTreeScanOpaque so;
RTSTACK *stk;
BlockNumber blk;
IndexTuple it;
blk = ItemPointerGetBlockNumber(&(s->currentItemData)); so = (RTreeScanOpaque) s->opaque;
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
if (ScanDirectionIsForward(dir)) if (!ItemPointerIsValid(&(s->currentItemData)))
n = OffsetNumberNext(n); {
else /* first call: start at the root */
n = OffsetNumberPrev(n); Assert(BufferIsValid(so->curbuf) == false);
so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
}
b = ReadBuffer(s->indexRelation, blk); p = BufferGetPage(so->curbuf);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p); po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
if (!ItemPointerIsValid(&(s->currentItemData)))
{
/* first call: start at first/last offset */
if (ScanDirectionIsForward(dir))
n = FirstOffsetNumber;
else
n = PageGetMaxOffsetNumber(p);
}
else
{
/* go on to the next offset */
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
if (ScanDirectionIsForward(dir))
n = OffsetNumberNext(n);
else
n = OffsetNumberPrev(n);
}
for (;;) for (;;)
{ {
maxoff = PageGetMaxOffsetNumber(p); IndexTuple it;
n = findnext(s, p, n, dir); RTSTACK *stk;
n = findnext(s, n, dir);
while (n < FirstOffsetNumber || n > maxoff) /* no match on this page, so read in the next stack entry */
if (n == InvalidOffsetNumber)
{ {
ReleaseBuffer(b); /* if out of stack entries, we're done */
if (so->s_stack == NULL) if (so->s_stack == NULL)
{
ReleaseBuffer(so->curbuf);
so->curbuf = InvalidBuffer;
return false; return false;
}
stk = so->s_stack; stk = so->s_stack;
b = ReadBuffer(s->indexRelation, stk->rts_blk); so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
p = BufferGetPage(b); stk->rts_blk);
maxoff = PageGetMaxOffsetNumber(p); p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p); po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir)) if (ScanDirectionIsBackward(dir))
...@@ -172,33 +138,41 @@ rtnext(IndexScanDesc s, ScanDirection dir) ...@@ -172,33 +138,41 @@ rtnext(IndexScanDesc s, ScanDirection dir)
so->s_stack = stk->rts_parent; so->s_stack = stk->rts_parent;
pfree(stk); pfree(stk);
n = findnext(s, p, n, dir); continue;
} }
if (po->flags & F_LEAF) if (po->flags & F_LEAF)
{ {
ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n); ItemPointerSet(&(s->currentItemData),
BufferGetBlockNumber(so->curbuf),
n);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
s->xs_ctup.t_self = it->t_tid; s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true; return true;
} }
else else
{ {
BlockNumber blk;
stk = (RTSTACK *) palloc(sizeof(RTSTACK)); stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n; stk->rts_child = n;
stk->rts_blk = BufferGetBlockNumber(b); stk->rts_blk = BufferGetBlockNumber(so->curbuf);
stk->rts_parent = so->s_stack; stk->rts_parent = so->s_stack;
so->s_stack = stk; so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid)); blk = ItemPointerGetBlockNumber(&(it->t_tid));
ReleaseBuffer(b); /*
b = ReadBuffer(s->indexRelation, blk); * Note that we release the pin on the page as we descend
p = BufferGetPage(b); * down the tree, even though there's a good chance we'll
* eventually need to re-read the buffer later in this
* scan. This may or may not be optimal, but it doesn't
* seem likely to make a huge performance difference
* either way.
*/
so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p); po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir)) if (ScanDirectionIsBackward(dir))
...@@ -209,17 +183,26 @@ rtnext(IndexScanDesc s, ScanDirection dir) ...@@ -209,17 +183,26 @@ rtnext(IndexScanDesc s, ScanDirection dir)
} }
} }
/*
* Return the offset of the next matching index entry. We begin the
* search at offset "n" and search for matches in the direction
* "dir". If no more matching entries are found on the page,
* InvalidOffsetNumber is returned.
*/
static OffsetNumber static OffsetNumber
findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir) findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
{ {
OffsetNumber maxoff; OffsetNumber maxoff;
IndexTuple it; IndexTuple it;
RTreePageOpaque po; RTreePageOpaque po;
RTreeScanOpaque so; RTreeScanOpaque so;
Page p;
so = (RTreeScanOpaque) s->opaque;
p = BufferGetPage(so->curbuf);
maxoff = PageGetMaxOffsetNumber(p); maxoff = PageGetMaxOffsetNumber(p);
po = (RTreePageOpaque) PageGetSpecialPointer(p); po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
/* /*
* If we modified the index during the scan, we may have a pointer to * If we modified the index during the scan, we may have a pointer to
...@@ -256,28 +239,8 @@ findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir) ...@@ -256,28 +239,8 @@ findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
n = OffsetNumberNext(n); n = OffsetNumberNext(n);
} }
return n; if (n >= FirstOffsetNumber && n <= maxoff)
} return n; /* found a match on this page */
else
static bool return InvalidOffsetNumber; /* no match, go to next page */
rtscancache(IndexScanDesc s, ScanDirection dir)
{
Buffer b;
Page p;
OffsetNumber n;
IndexTuple it;
if (!(ScanDirectionIsNoMovement(dir)
&& ItemPointerIsValid(&(s->currentItemData))))
return false;
b = ReadBuffer(s->indexRelation,
ItemPointerGetBlockNumber(&(s->currentItemData)));
p = BufferGetPage(b);
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true;
} }
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.85 2004/12/31 21:59:26 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.86 2005/01/18 23:25:47 neilc Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -280,12 +280,8 @@ rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate) ...@@ -280,12 +280,8 @@ rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
do do
{ {
/* let go of current buffer before getting next */ /* release the current buffer, read in the next one */
if (buffer != InvalidBuffer) buffer = ReleaseAndReadBuffer(buffer, r, blk);
ReleaseBuffer(buffer);
/* get next buffer */
buffer = ReadBuffer(r, blk);
page = (Page) BufferGetPage(buffer); page = (Page) BufferGetPage(buffer);
opaque = (RTreePageOpaque) PageGetSpecialPointer(page); opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.56 2004/12/31 21:59:26 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.57 2005/01/18 23:25:48 neilc Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -89,12 +89,24 @@ rtrescan(PG_FUNCTION_ARGS) ...@@ -89,12 +89,24 @@ rtrescan(PG_FUNCTION_ARGS)
freestack(p->s_markstk); freestack(p->s_markstk);
p->s_stack = p->s_markstk = NULL; p->s_stack = p->s_markstk = NULL;
p->s_flags = 0x0; p->s_flags = 0x0;
/* drop pins on buffers -- no locks held */
if (BufferIsValid(p->curbuf))
{
ReleaseBuffer(p->curbuf);
p->curbuf = InvalidBuffer;
}
if (BufferIsValid(p->markbuf))
{
ReleaseBuffer(p->markbuf);
p->markbuf = InvalidBuffer;
}
} }
else else
{ {
/* initialize opaque data */ /* initialize opaque data */
p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData)); p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
p->s_stack = p->s_markstk = NULL; p->s_stack = p->s_markstk = NULL;
p->curbuf = p->markbuf = InvalidBuffer;
p->s_internalNKey = s->numberOfKeys; p->s_internalNKey = s->numberOfKeys;
p->s_flags = 0x0; p->s_flags = 0x0;
s->opaque = p; s->opaque = p;
...@@ -175,6 +187,18 @@ rtmarkpos(PG_FUNCTION_ARGS) ...@@ -175,6 +187,18 @@ rtmarkpos(PG_FUNCTION_ARGS)
freestack(p->s_markstk); freestack(p->s_markstk);
p->s_markstk = o; p->s_markstk = o;
/* Update markbuf: make sure to bump ref count on curbuf */
if (BufferIsValid(p->markbuf))
{
ReleaseBuffer(p->markbuf);
p->markbuf = InvalidBuffer;
}
if (BufferIsValid(p->curbuf))
{
IncrBufferRefCount(p->curbuf);
p->markbuf = p->curbuf;
}
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -211,6 +235,18 @@ rtrestrpos(PG_FUNCTION_ARGS) ...@@ -211,6 +235,18 @@ rtrestrpos(PG_FUNCTION_ARGS)
freestack(p->s_stack); freestack(p->s_stack);
p->s_stack = o; p->s_stack = o;
/* Update curbuf; be sure to bump ref count on markbuf */
if (BufferIsValid(p->curbuf))
{
ReleaseBuffer(p->curbuf);
p->curbuf = InvalidBuffer;
}
if (BufferIsValid(p->markbuf))
{
IncrBufferRefCount(p->markbuf);
p->curbuf = p->markbuf;
}
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -226,11 +262,14 @@ rtendscan(PG_FUNCTION_ARGS) ...@@ -226,11 +262,14 @@ rtendscan(PG_FUNCTION_ARGS)
{ {
freestack(p->s_stack); freestack(p->s_stack);
freestack(p->s_markstk); freestack(p->s_markstk);
if (BufferIsValid(p->curbuf))
ReleaseBuffer(p->curbuf);
if (BufferIsValid(p->markbuf))
ReleaseBuffer(p->markbuf);
pfree(s->opaque); pfree(s->opaque);
} }
rtdropscan(s); rtdropscan(s);
/* XXX don't unset read lock -- two-phase locking */
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.36 2004/12/31 22:03:21 pgsql Exp $ * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.37 2005/01/18 23:25:55 neilc Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -59,11 +59,14 @@ typedef struct RTSTACK ...@@ -59,11 +59,14 @@ typedef struct RTSTACK
/* /*
* When we're doing a scan, we need to keep track of the parent stack * When we're doing a scan, we need to keep track of the parent stack
* for the marked and current items. Also, rtrees have the following * for the marked and current items. Also, rtrees have the following
* property: if you're looking for the box (1,1,2,2), on the internal * property: if you're looking for the box (1,1,2,2), on the internal
* nodes you have to search for all boxes that *contain* (1,1,2,2), and * nodes you have to search for all boxes that *contain* (1,1,2,2),
* not the ones that match it. We have a private scan key for internal * and not the ones that match it. We have a private scan key for
* nodes in the opaque structure for rtrees for this reason. See * internal nodes in the opaque structure for rtrees for this reason.
* access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized. * See access/index-rtree/rtscan.c and rtstrat.c for how it gets
* initialized. We also keep pins on the scan's current buffer and
* marked buffer, if any: this avoids the need to invoke ReadBuffer()
* for each tuple produced by the index scan.
*/ */
typedef struct RTreeScanOpaqueData typedef struct RTreeScanOpaqueData
...@@ -73,6 +76,8 @@ typedef struct RTreeScanOpaqueData ...@@ -73,6 +76,8 @@ typedef struct RTreeScanOpaqueData
uint16 s_flags; uint16 s_flags;
int s_internalNKey; int s_internalNKey;
ScanKey s_internalKey; ScanKey s_internalKey;
Buffer curbuf;
Buffer markbuf;
} RTreeScanOpaqueData; } RTreeScanOpaqueData;
typedef RTreeScanOpaqueData *RTreeScanOpaque; typedef RTreeScanOpaqueData *RTreeScanOpaque;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment