Commit 70f1482d authored by Tom Lane

Change seqscan logic so that we check visibility of all tuples on a page

when we first read the page, rather than checking them one at a time.
This allows us to take and release the buffer content lock just once
per page, instead of once per tuple.  Since it's a shared lock the
contention penalty for holding the lock longer shouldn't be too bad.
We can safely do this only when using an MVCC snapshot; otherwise the
assumption that visibility won't change over time does not hold.  Therefore
there are now two code paths depending on the snapshot type.  I also
made the same change in nodeBitmapHeapscan.c, where it can be done always
because we only support MVCC snapshots for bitmap scans anyway.
Also make some incidental cleanups in the APIs of these functions.
Per a suggestion from Qingqing Zhou.
parent 290166f9
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.203 2005/11/22 18:17:06 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.204 2005/11/26 03:03:07 tgl Exp $
* *
* *
* INTERFACE ROUTINES * INTERFACE ROUTINES
...@@ -78,12 +78,17 @@ initscan(HeapScanDesc scan, ScanKey key) ...@@ -78,12 +78,17 @@ initscan(HeapScanDesc scan, ScanKey key)
*/ */
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL; scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
scan->rs_cbuf = InvalidBuffer; scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
/* we don't have a marked position... */ /* we don't have a marked position... */
ItemPointerSetInvalid(&(scan->rs_mctid)); ItemPointerSetInvalid(&(scan->rs_mctid));
/* page-at-a-time fields are always invalid when not rs_inited */
/* /*
* copy the scan key, if appropriate * copy the scan key, if appropriate
*/ */
...@@ -93,79 +98,128 @@ initscan(HeapScanDesc scan, ScanKey key) ...@@ -93,79 +98,128 @@ initscan(HeapScanDesc scan, ScanKey key)
pgstat_count_heap_scan(&scan->rs_pgstat_info); pgstat_count_heap_scan(&scan->rs_pgstat_info);
} }
/* ---------------- /*
* heapgettup - fetch next heap tuple * heapgetpage - subroutine for heapgettup()
*
* routine used by heap_getnext() which does most of the
* real work in scanning tuples.
* *
* The passed-in *buffer must be either InvalidBuffer or the pinned * This routine reads and pins the specified page of the relation.
* current page of the scan. If we have to move to another page, * In page-at-a-time mode it performs additional work, namely determining
* we will unpin this buffer (if valid). On return, *buffer is either * which tuples on the page are visible.
* InvalidBuffer or the ID of a pinned buffer.
* ----------------
*/ */
static void static void
heapgettup(Relation relation, heapgetpage(HeapScanDesc scan, BlockNumber page)
int dir,
HeapTuple tuple,
Buffer *buffer,
Snapshot snapshot,
int nkeys,
ScanKey key,
BlockNumber pages)
{ {
ItemId lpp; Buffer buffer;
Snapshot snapshot;
Page dp; Page dp;
BlockNumber page;
int lines; int lines;
int ntup;
OffsetNumber lineoff; OffsetNumber lineoff;
int linesleft; ItemId lpp;
ItemPointer tid;
Assert(page < scan->rs_nblocks);
tid = (tuple->t_data == NULL) ? NULL : &(tuple->t_self); scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
scan->rs_rd,
page);
scan->rs_cblock = page;
if (!scan->rs_pageatatime)
return;
buffer = scan->rs_cbuf;
snapshot = scan->rs_snapshot;
/* /*
* debugging stuff * We must hold share lock on the buffer content while examining
* * tuple visibility. Afterwards, however, the tuples we have found
* check validity of arguments, here and for other functions too * to be visible are guaranteed good as long as we hold the buffer pin.
*
* Note: no locking manipulations needed--this is a local function
*/ */
#ifdef HEAPDEBUGALL LockBuffer(buffer, BUFFER_LOCK_SHARE);
if (ItemPointerIsValid(tid))
elog(DEBUG2, "heapgettup(%s, tid=0x%x[%d,%d], dir=%d, ...)",
RelationGetRelationName(relation), tid, tid->ip_blkid,
tid->ip_posid, dir);
else
elog(DEBUG2, "heapgettup(%s, tid=0x%x, dir=%d, ...)",
RelationGetRelationName(relation), tid, dir);
elog(DEBUG2, "heapgettup(..., b=0x%x, nkeys=%d, key=0x%x", buffer, nkeys, key);
elog(DEBUG2, "heapgettup: relation(%c)=`%s', %p", dp = (Page) BufferGetPage(buffer);
relation->rd_rel->relkind, RelationGetRelationName(relation), lines = PageGetMaxOffsetNumber(dp);
snapshot); ntup = 0;
#endif /* HEAPDEBUGALL */
if (!ItemPointerIsValid(tid)) for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
lineoff <= lines;
lineoff++, lpp++)
{ {
Assert(!PointerIsValid(tid)); if (ItemIdIsUsed(lpp))
tid = NULL; {
HeapTupleData loctup;
bool valid;
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), page, lineoff);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
} }
tuple->t_tableOid = RelationGetRelid(relation); LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/* Assert(ntup <= MaxHeapTuplesPerPage);
* return null immediately if relation is empty scan->rs_ntuples = ntup;
*/ }
if (pages == 0)
/* ----------------
* heapgettup - fetch next heap tuple
*
* Initialize the scan if not already done; then advance to the next
* tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
* or set scan->rs_ctup.t_data = NULL if no more tuples.
*
* dir == 0 means "re-fetch the tuple indicated by scan->rs_ctup".
*
* Note: the reason nkeys/key are passed separately, even though they are
* kept in the scan descriptor, is that the caller may not want us to check
* the scankeys.
*
* Note: when we fall off the end of the scan in either direction, we
* reset rs_inited. This means that a further request with the same
* scan direction will restart the scan, which is a bit odd, but a
* request with the opposite scan direction will start a fresh scan
* in the proper direction. The latter is required behavior for cursors,
* while the former case is generally undefined behavior in Postgres
* so we don't care too much.
* ----------------
*/
static void
heapgettup(HeapScanDesc scan,
int dir,
int nkeys,
ScanKey key)
{
HeapTuple tuple = &(scan->rs_ctup);
ItemPointer tid = &(tuple->t_self);
Snapshot snapshot = scan->rs_snapshot;
BlockNumber pages = scan->rs_nblocks;
BlockNumber page;
Page dp;
int lines;
OffsetNumber lineoff;
int linesleft;
ItemId lpp;
if (!scan->rs_inited)
{ {
if (BufferIsValid(*buffer)) /*
ReleaseBuffer(*buffer); * return null immediately if relation is empty
*buffer = InvalidBuffer; */
tuple->t_data = NULL; if (pages == 0)
return; {
Assert(!BufferIsValid(scan->rs_cbuf));
tuple->t_data = NULL;
return;
}
}
else
{
/* resuming scan from tuple indicated by scan->rs_ctup.t_self */
Assert(ItemPointerIsValid(tid));
} }
/* /*
...@@ -174,30 +228,26 @@ heapgettup(Relation relation, ...@@ -174,30 +228,26 @@ heapgettup(Relation relation,
if (dir == 0) if (dir == 0)
{ {
/* /*
* ``no movement'' scan direction: refetch same tuple * ``no movement'' scan direction: refetch prior tuple
*/ */
if (tid == NULL) if (!scan->rs_inited)
{ {
if (BufferIsValid(*buffer)) Assert(!BufferIsValid(scan->rs_cbuf));
ReleaseBuffer(*buffer);
*buffer = InvalidBuffer;
tuple->t_data = NULL; tuple->t_data = NULL;
return; return;
} }
*buffer = ReleaseAndReadBuffer(*buffer, page = ItemPointerGetBlockNumber(tid);
relation, if (page != scan->rs_cblock)
ItemPointerGetBlockNumber(tid)); heapgetpage(scan, page);
LockBuffer(*buffer, BUFFER_LOCK_SHARE);
dp = (Page) BufferGetPage(*buffer); /* Since the tuple was previously fetched, needn't lock page here */
dp = (Page) BufferGetPage(scan->rs_cbuf);
lineoff = ItemPointerGetOffsetNumber(tid); lineoff = ItemPointerGetOffsetNumber(tid);
lpp = PageGetItemId(dp, lineoff); lpp = PageGetItemId(dp, lineoff);
tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
tuple->t_len = ItemIdGetLength(lpp); tuple->t_len = ItemIdGetLength(lpp);
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
return; return;
} }
...@@ -206,28 +256,23 @@ heapgettup(Relation relation, ...@@ -206,28 +256,23 @@ heapgettup(Relation relation,
/* /*
* reverse scan direction * reverse scan direction
*/ */
if (tid == NULL) if (!scan->rs_inited)
{
page = pages - 1; /* final page */ page = pages - 1; /* final page */
}
else else
{
page = ItemPointerGetBlockNumber(tid); /* current page */ page = ItemPointerGetBlockNumber(tid); /* current page */
}
Assert(page < pages);
*buffer = ReleaseAndReadBuffer(*buffer, if (page != scan->rs_cblock)
relation, heapgetpage(scan, page);
page);
LockBuffer(*buffer, BUFFER_LOCK_SHARE); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
dp = (Page) BufferGetPage(*buffer); dp = (Page) BufferGetPage(scan->rs_cbuf);
lines = PageGetMaxOffsetNumber(dp); lines = PageGetMaxOffsetNumber(dp);
if (tid == NULL)
if (!scan->rs_inited)
{ {
lineoff = lines; /* final offnum */ lineoff = lines; /* final offnum */
scan->rs_inited = true;
} }
else else
{ {
...@@ -241,10 +286,11 @@ heapgettup(Relation relation, ...@@ -241,10 +286,11 @@ heapgettup(Relation relation,
/* /*
* forward scan direction * forward scan direction
*/ */
if (tid == NULL) if (!scan->rs_inited)
{ {
page = 0; /* first page */ page = 0; /* first page */
lineoff = FirstOffsetNumber; /* first offnum */ lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
} }
else else
{ {
...@@ -253,15 +299,12 @@ heapgettup(Relation relation, ...@@ -253,15 +299,12 @@ heapgettup(Relation relation,
OffsetNumberNext(ItemPointerGetOffsetNumber(tid)); OffsetNumberNext(ItemPointerGetOffsetNumber(tid));
} }
Assert(page < pages); if (page != scan->rs_cblock)
heapgetpage(scan, page);
*buffer = ReleaseAndReadBuffer(*buffer,
relation,
page);
LockBuffer(*buffer, BUFFER_LOCK_SHARE); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
dp = (Page) BufferGetPage(*buffer); dp = (Page) BufferGetPage(scan->rs_cbuf);
lines = PageGetMaxOffsetNumber(dp); lines = PageGetMaxOffsetNumber(dp);
/* page and lineoff now reference the physically next tid */ /* page and lineoff now reference the physically next tid */
} }
...@@ -269,22 +312,21 @@ heapgettup(Relation relation, ...@@ -269,22 +312,21 @@ heapgettup(Relation relation,
/* 'dir' is now non-zero */ /* 'dir' is now non-zero */
/* /*
* calculate line pointer and number of remaining items to check on this * calculate number of remaining items to check on this page
* page.
*/ */
lpp = PageGetItemId(dp, lineoff);
if (dir < 0) if (dir < 0)
linesleft = lineoff - 1; linesleft = lineoff;
else else
linesleft = lines - lineoff; linesleft = lines - lineoff + 1;
/* /*
* advance the scan until we find a qualifying tuple or run out of stuff * advance the scan until we find a qualifying tuple or run out of stuff
* to scan * to scan
*/ */
lpp = PageGetItemId(dp, lineoff);
for (;;) for (;;)
{ {
while (linesleft >= 0) while (linesleft > 0)
{ {
if (ItemIdIsUsed(lpp)) if (ItemIdIsUsed(lpp))
{ {
...@@ -297,11 +339,17 @@ heapgettup(Relation relation, ...@@ -297,11 +339,17 @@ heapgettup(Relation relation,
/* /*
* if current tuple qualifies, return it. * if current tuple qualifies, return it.
*/ */
HeapTupleSatisfies(tuple, relation, *buffer, (PageHeader) dp, valid = HeapTupleSatisfiesVisibility(tuple,
snapshot, nkeys, key, valid); snapshot,
scan->rs_cbuf);
if (valid && key != NULL)
HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
nkeys, key, valid);
if (valid) if (valid)
{ {
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
return; return;
} }
} }
...@@ -326,32 +374,31 @@ heapgettup(Relation relation, ...@@ -326,32 +374,31 @@ heapgettup(Relation relation,
* if we get here, it means we've exhausted the items on this page and * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next. * it's time to move to the next.
*/ */
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/* /*
* return NULL if we've exhausted all the pages * return NULL if we've exhausted all the pages
*/ */
if ((dir < 0) ? (page == 0) : (page + 1 >= pages)) if ((dir < 0) ? (page == 0) : (page + 1 >= pages))
{ {
if (BufferIsValid(*buffer)) if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(*buffer); ReleaseBuffer(scan->rs_cbuf);
*buffer = InvalidBuffer; scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
tuple->t_data = NULL; tuple->t_data = NULL;
scan->rs_inited = false;
return; return;
} }
page = (dir < 0) ? (page - 1) : (page + 1); page = (dir < 0) ? (page - 1) : (page + 1);
Assert(page < pages); heapgetpage(scan, page);
*buffer = ReleaseAndReadBuffer(*buffer, LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
relation,
page);
LockBuffer(*buffer, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(scan->rs_cbuf);
dp = (Page) BufferGetPage(*buffer);
lines = PageGetMaxOffsetNumber((Page) dp); lines = PageGetMaxOffsetNumber((Page) dp);
linesleft = lines - 1; linesleft = lines;
if (dir < 0) if (dir < 0)
{ {
lineoff = lines; lineoff = lines;
...@@ -365,6 +412,233 @@ heapgettup(Relation relation, ...@@ -365,6 +412,233 @@ heapgettup(Relation relation,
} }
} }
/* ----------------
 *		heapgettup_pagemode - page-at-a-time variant of heapgettup
 *
 *		Takes the same arguments as heapgettup and reports its result the
 *		same way: the fetched tuple is left in scan->rs_ctup, or
 *		scan->rs_ctup.t_data is set to NULL when there is nothing to return.
 *
 *		dir < 0 steps backward, dir > 0 steps forward, and dir == 0
 *		re-fetches the tuple that scan->rs_ctup.t_self already points at.
 *
 *		Unlike heapgettup, this routine never takes the buffer content lock
 *		and never tests visibility itself.  It only walks the offsets stored
 *		in scan->rs_vistuples[] (rs_ntuples entries) for the current page.
 *		Positions in that array are 0-based, whereas the offset numbers
 *		stored in it are 1-based line pointer numbers.
 *
 *		Scan keys are applied only when "key" is not NULL.
 * ----------------
 */
static void
heapgettup_pagemode(HeapScanDesc scan,
					int dir,
					int nkeys,
					ScanKey key)
{
	HeapTuple	ctup = &(scan->rs_ctup);
	ItemPointer ctid = &(ctup->t_self);
	BlockNumber nblocks = scan->rs_nblocks;
	bool		starting = !scan->rs_inited;
	BlockNumber blkno;
	Page		pagep;
	ItemId		itemid;
	OffsetNumber offnum;
	int			nvisible;
	int			idx;
	int			remaining;
	int			step;

	if (!starting)
	{
		/* resuming: rs_ctup.t_self must identify the current tuple */
		Assert(ItemPointerIsValid(ctid));
	}
	else if (nblocks == 0)
	{
		/* an empty relation has nothing to return */
		Assert(!BufferIsValid(scan->rs_cbuf));
		ctup->t_data = NULL;
		return;
	}

	/*
	 * "No movement": hand back the tuple we are already positioned on.
	 */
	if (dir == 0)
	{
		if (starting)
		{
			/* never positioned, so there is no prior tuple to refetch */
			Assert(!BufferIsValid(scan->rs_cbuf));
			ctup->t_data = NULL;
			return;
		}

		blkno = ItemPointerGetBlockNumber(ctid);
		if (blkno != scan->rs_cblock)
			heapgetpage(scan, blkno);

		/* the tuple was fetched before, so the page is not locked here */
		pagep = (Page) BufferGetPage(scan->rs_cbuf);
		offnum = ItemPointerGetOffsetNumber(ctid);
		itemid = PageGetItemId(pagep, offnum);

		ctup->t_data = (HeapTupleHeader) PageGetItem((Page) pagep, itemid);
		ctup->t_len = ItemIdGetLength(itemid);

		/* rs_cindex must still designate this same tuple */
		Assert(scan->rs_cindex < scan->rs_ntuples);
		Assert(offnum == scan->rs_vistuples[scan->rs_cindex]);
		return;
	}

	/*
	 * From here on dir is non-zero.  Work out the page to look at, the
	 * first rs_vistuples[] slot to examine, and how many slots remain on
	 * the page in the direction of travel.
	 */
	if (dir < 0)
	{
		/* backward: a fresh scan begins on the last page */
		if (starting)
			blkno = nblocks - 1;
		else
			blkno = ItemPointerGetBlockNumber(ctid);

		if (blkno != scan->rs_cblock)
			heapgetpage(scan, blkno);

		pagep = (Page) BufferGetPage(scan->rs_cbuf);
		nvisible = scan->rs_ntuples;

		if (starting)
		{
			/* last visible entry of the page */
			idx = nvisible - 1;
			scan->rs_inited = true;
		}
		else
			idx = scan->rs_cindex - 1;

		step = -1;
		remaining = idx + 1;
	}
	else
	{
		/* forward: a fresh scan begins at the first slot of page zero */
		if (starting)
		{
			blkno = 0;
			idx = 0;
			scan->rs_inited = true;
		}
		else
		{
			blkno = ItemPointerGetBlockNumber(ctid);
			idx = scan->rs_cindex + 1;
		}

		if (blkno != scan->rs_cblock)
			heapgetpage(scan, blkno);

		pagep = (Page) BufferGetPage(scan->rs_cbuf);
		nvisible = scan->rs_ntuples;

		step = 1;
		remaining = nvisible - idx;
	}

	/*
	 * Walk pages until a tuple qualifies or the relation is exhausted.
	 */
	for (;;)
	{
		for (; remaining > 0; remaining--, idx += step)
		{
			bool		matches;

			offnum = scan->rs_vistuples[idx];
			itemid = PageGetItemId(pagep, offnum);
			Assert(ItemIdIsUsed(itemid));

			ctup->t_data = (HeapTupleHeader) PageGetItem((Page) pagep, itemid);
			ctup->t_len = ItemIdGetLength(itemid);
			ItemPointerSet(ctid, blkno, offnum);

			/* with no scan keys every listed tuple qualifies */
			if (key == NULL)
			{
				scan->rs_cindex = idx;
				return;
			}

			HeapKeyTest(ctup, RelationGetDescr(scan->rs_rd),
						nkeys, key, matches);
			if (matches)
			{
				scan->rs_cindex = idx;
				return;
			}
		}

		/*
		 * This page is used up; step to the neighbouring page, or leave
		 * the loop if there is none in this direction.
		 */
		if (dir < 0)
		{
			if (blkno == 0)
				break;
			blkno--;
		}
		else
		{
			if (blkno + 1 >= nblocks)
				break;
			blkno++;
		}

		heapgetpage(scan, blkno);

		pagep = (Page) BufferGetPage(scan->rs_cbuf);
		nvisible = scan->rs_ntuples;
		remaining = nvisible;
		idx = (dir < 0) ? nvisible - 1 : 0;
	}

	/*
	 * Ran off the end of the relation: drop the buffer pin, report "no
	 * tuple", and mark the scan as not initialized.
	 */
	if (BufferIsValid(scan->rs_cbuf))
		ReleaseBuffer(scan->rs_cbuf);
	scan->rs_cbuf = InvalidBuffer;
	scan->rs_cblock = InvalidBlockNumber;
	ctup->t_data = NULL;
	scan->rs_inited = false;
}
#if defined(DISABLE_COMPLEX_MACRO) #if defined(DISABLE_COMPLEX_MACRO)
/* /*
...@@ -642,6 +916,14 @@ heap_beginscan(Relation relation, Snapshot snapshot, ...@@ -642,6 +916,14 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_snapshot = snapshot; scan->rs_snapshot = snapshot;
scan->rs_nkeys = nkeys; scan->rs_nkeys = nkeys;
/*
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
*/
scan->rs_pageatatime = IsMVCCSnapshot(snapshot);
/* we only need to set this up once */
scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
/* /*
* we do this here instead of in initscan() because heap_rescan also calls * we do this here instead of in initscan() because heap_rescan also calls
* initscan() and we don't want to allocate memory again * initscan() and we don't want to allocate memory again
...@@ -741,16 +1023,14 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) ...@@ -741,16 +1023,14 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
/* /*
* Note: we depend here on the -1/0/1 encoding of ScanDirection. * Note: we depend here on the -1/0/1 encoding of ScanDirection.
*/ */
heapgettup(scan->rs_rd, if (scan->rs_pageatatime)
(int) direction, heapgettup_pagemode(scan, (int) direction,
&(scan->rs_ctup), scan->rs_nkeys, scan->rs_key);
&(scan->rs_cbuf), else
scan->rs_snapshot, heapgettup(scan, (int) direction,
scan->rs_nkeys, scan->rs_nkeys, scan->rs_key);
scan->rs_key,
scan->rs_nblocks);
if (scan->rs_ctup.t_data == NULL && !BufferIsValid(scan->rs_cbuf)) if (scan->rs_ctup.t_data == NULL)
{ {
HEAPDEBUG_2; /* heap_getnext returning EOS */ HEAPDEBUG_2; /* heap_getnext returning EOS */
return NULL; return NULL;
...@@ -760,13 +1040,11 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) ...@@ -760,13 +1040,11 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
* if we get here it means we have a new current scan tuple, so point to * if we get here it means we have a new current scan tuple, so point to
* the proper return buffer and return the tuple. * the proper return buffer and return the tuple.
*/ */
HEAPDEBUG_3; /* heap_getnext returning tuple */ HEAPDEBUG_3; /* heap_getnext returning tuple */
if (scan->rs_ctup.t_data != NULL) pgstat_count_heap_getnext(&scan->rs_pgstat_info);
pgstat_count_heap_getnext(&scan->rs_pgstat_info);
return ((scan->rs_ctup.t_data == NULL) ? NULL : &(scan->rs_ctup)); return &(scan->rs_ctup);
} }
/* /*
...@@ -903,8 +1181,7 @@ heap_release_fetch(Relation relation, ...@@ -903,8 +1181,7 @@ heap_release_fetch(Relation relation,
/* /*
* check time qualification of tuple, then release lock * check time qualification of tuple, then release lock
*/ */
HeapTupleSatisfies(tuple, relation, buffer, dp, valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
snapshot, 0, NULL, valid);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK); LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
...@@ -1038,8 +1315,7 @@ heap_get_latest_tid(Relation relation, ...@@ -1038,8 +1315,7 @@ heap_get_latest_tid(Relation relation,
* Check time qualification of tuple; if visible, set it as the new * Check time qualification of tuple; if visible, set it as the new
* result candidate. * result candidate.
*/ */
HeapTupleSatisfies(&tp, relation, buffer, dp, valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
snapshot, 0, NULL, valid);
if (valid) if (valid)
*tid = ctid; *tid = ctid;
...@@ -2439,7 +2715,11 @@ heap_markpos(HeapScanDesc scan) ...@@ -2439,7 +2715,11 @@ heap_markpos(HeapScanDesc scan)
/* Note: no locking manipulations needed */ /* Note: no locking manipulations needed */
if (scan->rs_ctup.t_data != NULL) if (scan->rs_ctup.t_data != NULL)
{
scan->rs_mctid = scan->rs_ctup.t_self; scan->rs_mctid = scan->rs_ctup.t_self;
if (scan->rs_pageatatime)
scan->rs_mindex = scan->rs_cindex;
}
else else
ItemPointerSetInvalid(&scan->rs_mctid); ItemPointerSetInvalid(&scan->rs_mctid);
} }
...@@ -2453,31 +2733,38 @@ heap_restrpos(HeapScanDesc scan) ...@@ -2453,31 +2733,38 @@ heap_restrpos(HeapScanDesc scan)
{ {
/* XXX no amrestrpos checking that ammarkpos called */ /* XXX no amrestrpos checking that ammarkpos called */
/* Note: no locking manipulations needed */
/*
* unpin scan buffers
*/
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
if (!ItemPointerIsValid(&scan->rs_mctid)) if (!ItemPointerIsValid(&scan->rs_mctid))
{ {
scan->rs_ctup.t_data = NULL; scan->rs_ctup.t_data = NULL;
/*
* unpin scan buffers
*/
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
} }
else else
{ {
/*
* If we reached end of scan, rs_inited will now be false. We must
* reset it to true to keep heapgettup from doing the wrong thing.
*/
scan->rs_inited = true;
scan->rs_ctup.t_self = scan->rs_mctid; scan->rs_ctup.t_self = scan->rs_mctid;
scan->rs_ctup.t_data = (HeapTupleHeader) 0x1; /* for heapgettup */ if (scan->rs_pageatatime)
heapgettup(scan->rs_rd, {
0, scan->rs_cindex = scan->rs_mindex;
&(scan->rs_ctup), heapgettup_pagemode(scan,
&(scan->rs_cbuf), 0, /* "no movement" */
scan->rs_snapshot, 0, /* needn't recheck scan keys */
0, NULL);
NULL, }
scan->rs_nblocks); else
heapgettup(scan,
0, /* "no movement" */
0, /* needn't recheck scan keys */
NULL);
} }
} }
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.5 2005/11/25 04:24:48 tgl Exp $ * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.6 2005/11/26 03:03:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
/* ---------------------------------------------------------------- /* ----------------------------------------------------------------
...@@ -57,7 +58,7 @@ BitmapHeapNext(BitmapHeapScanState *node) ...@@ -57,7 +58,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
{ {
EState *estate; EState *estate;
ExprContext *econtext; ExprContext *econtext;
HeapScanDesc scandesc; HeapScanDesc scan;
Index scanrelid; Index scanrelid;
TIDBitmap *tbm; TIDBitmap *tbm;
TBMIterateResult *tbmres; TBMIterateResult *tbmres;
...@@ -70,7 +71,7 @@ BitmapHeapNext(BitmapHeapScanState *node) ...@@ -70,7 +71,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
estate = node->ss.ps.state; estate = node->ss.ps.state;
econtext = node->ss.ps.ps_ExprContext; econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot; slot = node->ss.ss_ScanTupleSlot;
scandesc = node->ss.ss_currentScanDesc; scan = node->ss.ss_currentScanDesc;
scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid; scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid;
tbm = node->tbm; tbm = node->tbm;
tbmres = node->tbmres; tbmres = node->tbmres;
...@@ -123,6 +124,9 @@ BitmapHeapNext(BitmapHeapScanState *node) ...@@ -123,6 +124,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
for (;;) for (;;)
{ {
Page dp;
ItemId lp;
/* /*
* Get next page of results if needed * Get next page of results if needed
*/ */
...@@ -141,134 +145,199 @@ BitmapHeapNext(BitmapHeapScanState *node) ...@@ -141,134 +145,199 @@ BitmapHeapNext(BitmapHeapScanState *node)
* AccessShareLock before performing any of the indexscans, but * AccessShareLock before performing any of the indexscans, but
* let's be safe.) * let's be safe.)
*/ */
if (tbmres->blockno >= scandesc->rs_nblocks) if (tbmres->blockno >= scan->rs_nblocks)
{ {
node->tbmres = tbmres = NULL; node->tbmres = tbmres = NULL;
continue; continue;
} }
/* /*
* Acquire pin on the current heap page. We'll hold the pin until * Fetch the current heap page and identify candidate tuples.
* done looking at the page. We trade in any pin we held before.
*/ */
scandesc->rs_cbuf = ReleaseAndReadBuffer(scandesc->rs_cbuf, bitgetpage(scan, tbmres);
scandesc->rs_rd,
tbmres->blockno);
/*
* Determine how many entries we need to look at on this page. If
* the bitmap is lossy then we need to look at each physical item
* pointer; otherwise we just look through the offsets listed in
* tbmres.
*/
if (tbmres->ntuples >= 0)
{
/* non-lossy case */
node->minslot = 0;
node->maxslot = tbmres->ntuples - 1;
}
else
{
/* lossy case */
Page dp;
LockBuffer(scandesc->rs_cbuf, BUFFER_LOCK_SHARE);
dp = (Page) BufferGetPage(scandesc->rs_cbuf);
node->minslot = FirstOffsetNumber;
node->maxslot = PageGetMaxOffsetNumber(dp);
LockBuffer(scandesc->rs_cbuf, BUFFER_LOCK_UNLOCK);
}
/* /*
* Set curslot to first slot to examine * Set rs_cindex to first slot to examine
*/ */
node->curslot = node->minslot; scan->rs_cindex = 0;
} }
else else
{ {
/* /*
* Continuing in previously obtained page; advance curslot * Continuing in previously obtained page; advance rs_cindex
*/ */
node->curslot++; scan->rs_cindex++;
} }
/* /*
* Out of range? If so, nothing more to look at on this page * Out of range? If so, nothing more to look at on this page
*/ */
if (node->curslot < node->minslot || node->curslot > node->maxslot) if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
{ {
node->tbmres = tbmres = NULL; node->tbmres = tbmres = NULL;
continue; continue;
} }
/* /*
* Okay to try to fetch the tuple * Okay to fetch the tuple
*/
targoffset = scan->rs_vistuples[scan->rs_cindex];
dp = (Page) BufferGetPage(scan->rs_cbuf);
lp = PageGetItemId(dp, targoffset);
Assert(ItemIdIsUsed(lp));
scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
scan->rs_ctup.t_len = ItemIdGetLength(lp);
ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
pgstat_count_heap_fetch(&scan->rs_pgstat_info);
/*
* Set up the result slot to point to this tuple. Note that the
* slot acquires a pin on the buffer.
*/ */
ExecStoreTuple(&scan->rs_ctup,
slot,
scan->rs_cbuf,
false);
/*
* If we are using lossy info, we have to recheck the qual
* conditions at every tuple.
*/
if (tbmres->ntuples < 0)
{
econtext->ecxt_scantuple = slot;
ResetExprContext(econtext);
if (!ExecQual(node->bitmapqualorig, econtext, false))
{
/* Fails recheck, so drop it and loop back for another */
ExecClearTuple(slot);
continue;
}
}
/* OK to return this tuple */
return slot;
}
/*
* if we get here it means we are at the end of the scan..
*/
return ExecClearTuple(slot);
}
/*
* bitgetpage - subroutine for BitmapHeapNext()
*
* This routine reads and pins the specified page of the relation, then
* builds an array indicating which tuples on the page are both potentially
* interesting according to the bitmap, and visible according to the snapshot.
*/
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
BlockNumber page = tbmres->blockno;
Buffer buffer;
Snapshot snapshot;
Page dp;
int ntup;
int curslot;
int minslot;
int maxslot;
int maxoff;
/*
* Acquire pin on the target heap page, trading in any pin we held before.
*/
Assert(page < scan->rs_nblocks);
scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
scan->rs_rd,
page);
buffer = scan->rs_cbuf;
snapshot = scan->rs_snapshot;
/*
* We must hold share lock on the buffer content while examining
* tuple visibility. Afterwards, however, the tuples we have found
* to be visible are guaranteed good as long as we hold the buffer pin.
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
dp = (Page) BufferGetPage(buffer);
maxoff = PageGetMaxOffsetNumber(dp);
/*
* Determine how many entries we need to look at on this page. If
* the bitmap is lossy then we need to look at each physical item
* pointer; otherwise we just look through the offsets listed in
* tbmres.
*/
if (tbmres->ntuples >= 0)
{
/* non-lossy case */
minslot = 0;
maxslot = tbmres->ntuples - 1;
}
else
{
/* lossy case */
minslot = FirstOffsetNumber;
maxslot = maxoff;
}
ntup = 0;
for (curslot = minslot; curslot <= maxslot; curslot++)
{
OffsetNumber targoffset;
ItemId lp;
HeapTupleData loctup;
bool valid;
if (tbmres->ntuples >= 0) if (tbmres->ntuples >= 0)
{ {
/* non-lossy case */ /* non-lossy case */
targoffset = tbmres->offsets[node->curslot]; targoffset = tbmres->offsets[curslot];
} }
else else
{ {
/* lossy case */ /* lossy case */
targoffset = (OffsetNumber) node->curslot; targoffset = (OffsetNumber) curslot;
} }
ItemPointerSet(&scandesc->rs_ctup.t_self, tbmres->blockno, targoffset);
/* /*
* Fetch the heap tuple and see if it matches the snapshot. We use * We'd better check for out-of-range offnum in case of VACUUM since
* heap_release_fetch to avoid useless bufmgr traffic. * the TID was obtained.
*/ */
if (heap_release_fetch(scandesc->rs_rd, if (targoffset < FirstOffsetNumber || targoffset > maxoff)
scandesc->rs_snapshot, continue;
&scandesc->rs_ctup,
&scandesc->rs_cbuf,
true,
&scandesc->rs_pgstat_info))
{
/*
* Set up the result slot to point to this tuple. Note that the
* slot acquires a pin on the buffer.
*/
ExecStoreTuple(&scandesc->rs_ctup,
slot,
scandesc->rs_cbuf,
false);
/* lp = PageGetItemId(dp, targoffset);
* If we are using lossy info, we have to recheck the qual
* conditions at every tuple.
*/
if (tbmres->ntuples < 0)
{
econtext->ecxt_scantuple = slot;
ResetExprContext(econtext);
if (!ExecQual(node->bitmapqualorig, econtext, false))
{
/* Fails recheck, so drop it and loop back for another */
ExecClearTuple(slot);
continue;
}
}
/* OK to return this tuple */ /*
return slot; * Must check for deleted tuple.
} */
if (!ItemIdIsUsed(lp))
continue;
/* /*
* Failed the snap, so loop back and try again. * check time qualification of tuple, remember it if valid
*/ */
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
loctup.t_len = ItemIdGetLength(lp);
ItemPointerSet(&(loctup.t_self), page, targoffset);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
if (valid)
scan->rs_vistuples[ntup++] = targoffset;
} }
/* LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
* if we get here it means we are at the end of the scan..
*/ Assert(ntup <= MaxHeapTuplesPerPage);
return ExecClearTuple(slot); scan->rs_ntuples = ntup;
} }
/* ---------------------------------------------------------------- /* ----------------------------------------------------------------
...@@ -403,6 +472,12 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate) ...@@ -403,6 +472,12 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate)
Oid reloid; Oid reloid;
Relation currentRelation; Relation currentRelation;
/*
* Assert caller didn't ask for an unsafe snapshot --- see comments
* at head of file.
*/
Assert(IsMVCCSnapshot(estate->es_snapshot));
/* /*
* create state structure * create state structure
*/ */
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.41 2005/10/15 02:49:42 momjian Exp $ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.42 2005/11/26 03:03:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -26,14 +26,23 @@ typedef struct HeapScanDescData ...@@ -26,14 +26,23 @@ typedef struct HeapScanDescData
int rs_nkeys; /* number of scan keys */ int rs_nkeys; /* number of scan keys */
ScanKey rs_key; /* array of scan key descriptors */ ScanKey rs_key; /* array of scan key descriptors */
BlockNumber rs_nblocks; /* number of blocks to scan */ BlockNumber rs_nblocks; /* number of blocks to scan */
bool rs_pageatatime; /* verify visibility page-at-a-time? */
/* scan current state */ /* scan current state */
bool rs_inited; /* false = scan not init'd yet */
HeapTupleData rs_ctup; /* current tuple in scan, if any */ HeapTupleData rs_ctup; /* current tuple in scan, if any */
BlockNumber rs_cblock; /* current block # in scan, if any */
Buffer rs_cbuf; /* current buffer in scan, if any */ Buffer rs_cbuf; /* current buffer in scan, if any */
/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
ItemPointerData rs_mctid; /* marked scan position, if any */ ItemPointerData rs_mctid; /* marked scan position, if any */
PgStat_Info rs_pgstat_info; /* statistics collector hook */ PgStat_Info rs_pgstat_info; /* statistics collector hook */
/* these fields only used in page-at-a-time mode */
int rs_cindex; /* current tuple's index in vistuples */
int rs_mindex; /* marked tuple's saved index */
int rs_ntuples; /* number of visible tuples on page */
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */
} HeapScanDescData; } HeapScanDescData;
typedef HeapScanDescData *HeapScanDesc; typedef HeapScanDescData *HeapScanDesc;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/valid.h,v 1.36 2004/12/31 22:03:21 pgsql Exp $ * $PostgreSQL: pgsql/src/include/access/valid.h,v 1.37 2005/11/26 03:03:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -65,37 +65,4 @@ do \ ...@@ -65,37 +65,4 @@ do \
} \ } \
} while (0) } while (0)
/*
 * HeapTupleSatisfies
 *
 * res is set TRUE if the HeapTuple satisfies the timequal and keytest,
 * otherwise it is set FALSE. Note that the hint bits in the HeapTuple's
 * t_infomask may be updated as a side effect.
 *
 * Order of evaluation: the key test (HeapKeyTest against the relation's
 * tuple descriptor) runs first; when key is NULL it is skipped and treated
 * as passed. The visibility check (HeapTupleSatisfiesVisibility) runs only
 * if the key test passed and the relation's relkind is not
 * RELKIND_UNCATALOGED.
 *
 * Notes:
 * The disk_page argument is not referenced anywhere in the expansion.
 * key, relation and res can each be evaluated more than once, so do not
 * pass expressions with side effects.
 *
 * on 8/21/92 mao says: i rearranged the tests here to do keytest before
 * SatisfiesTimeQual. profiling indicated that even for vacuumed relations,
 * time qual checking was more expensive than key testing. time qual is
 * least likely to fail, too. we should really add the time qual test to
 * the restriction and optimize it in the normal way. this has interactions
 * with joey's expensive function work.
 */
#define HeapTupleSatisfies(tuple, \
relation, \
buffer, \
disk_page, \
snapshot, \
nKeys, \
key, \
res) \
do \
{ \
if ((key) != NULL) \
HeapKeyTest(tuple, RelationGetDescr(relation), nKeys, key, res); \
else \
(res) = true; \
\
if ((res) && (relation)->rd_rel->relkind != RELKIND_UNCATALOGED) \
(res) = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); \
} while (0)
#endif /* VALID_H */ #endif /* VALID_H */
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.142 2005/11/25 19:47:50 tgl Exp $ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.143 2005/11/26 03:03:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -958,9 +958,6 @@ typedef struct BitmapIndexScanState ...@@ -958,9 +958,6 @@ typedef struct BitmapIndexScanState
* bitmapqualorig execution state for bitmapqualorig expressions * bitmapqualorig execution state for bitmapqualorig expressions
* tbm bitmap obtained from child index scan(s) * tbm bitmap obtained from child index scan(s)
* tbmres current-page data * tbmres current-page data
* curslot current tbmres index or tuple offset on page
* minslot lowest tbmres index or tuple offset to try
* maxslot highest tbmres index or tuple offset to try
* ---------------- * ----------------
*/ */
typedef struct BitmapHeapScanState typedef struct BitmapHeapScanState
...@@ -969,9 +966,6 @@ typedef struct BitmapHeapScanState ...@@ -969,9 +966,6 @@ typedef struct BitmapHeapScanState
List *bitmapqualorig; List *bitmapqualorig;
TIDBitmap *tbm; TIDBitmap *tbm;
TBMIterateResult *tbmres; TBMIterateResult *tbmres;
int curslot;
int minslot;
int maxslot;
} BitmapHeapScanState; } BitmapHeapScanState;
/* ---------------- /* ----------------
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/utils/tqual.h,v 1.59 2005/10/15 02:49:46 momjian Exp $ * $PostgreSQL: pgsql/src/include/utils/tqual.h,v 1.60 2005/11/26 03:03:07 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -55,6 +55,15 @@ typedef SnapshotData *Snapshot; ...@@ -55,6 +55,15 @@ typedef SnapshotData *Snapshot;
extern DLLIMPORT Snapshot SnapshotDirty; extern DLLIMPORT Snapshot SnapshotDirty;
/*
 * IsMVCCSnapshot
 * True iff the given snapshot is none of the special snapshots
 * SnapshotNow, SnapshotSelf, SnapshotAny, SnapshotToast or SnapshotDirty.
 * This macro encodes the knowledge of which snapshots are MVCC-safe.
 *
 * Notes:
 * The snapshot argument may be evaluated up to five times; do not pass
 * an expression with side effects.
 */
#define IsMVCCSnapshot(snapshot) \
((snapshot) != SnapshotNow && \
(snapshot) != SnapshotSelf && \
(snapshot) != SnapshotAny && \
(snapshot) != SnapshotToast && \
(snapshot) != SnapshotDirty)
extern DLLIMPORT Snapshot SerializableSnapshot; extern DLLIMPORT Snapshot SerializableSnapshot;
extern DLLIMPORT Snapshot LatestSnapshot; extern DLLIMPORT Snapshot LatestSnapshot;
extern DLLIMPORT Snapshot ActiveSnapshot; extern DLLIMPORT Snapshot ActiveSnapshot;
...@@ -69,8 +78,9 @@ extern TransactionId RecentGlobalXmin; ...@@ -69,8 +78,9 @@ extern TransactionId RecentGlobalXmin;
* True iff heap tuple satisfies a time qual. * True iff heap tuple satisfies a time qual.
* *
* Notes: * Notes:
* Assumes heap tuple is valid. * Assumes heap tuple is valid.
* Beware of multiple evaluations of snapshot argument. * Beware of multiple evaluations of snapshot argument.
* Hint bits in the HeapTuple's t_infomask may be updated as a side effect.
*/ */
#define HeapTupleSatisfiesVisibility(tuple, snapshot, buffer) \ #define HeapTupleSatisfiesVisibility(tuple, snapshot, buffer) \
((snapshot) == SnapshotNow ? \ ((snapshot) == SnapshotNow ? \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment