Commit 73c954d2 authored by Andres Freund's avatar Andres Freund

tableam: sample scan.

This moves sample scan support to below tableam. It's not optional as
there is, in contrast to e.g. bitmap heap scans, no alternative way to
perform tablesample queries. If an AM can't deal with the block based
API, it will have to throw an ERROR.

The tableam callbacks for this are block based, but given the current
TsmRoutine interface, that seems to be required.

The new interface doesn't require TsmRoutines to perform visibility
checks anymore - that requires the TsmRoutine to know details about
the AM, which we want to avoid.  To continue to allow taking the
returned number of tuples account SampleScanState now has a donetuples
field (which previously e.g. existed in SystemRowsSamplerData), which
is only incremented after the visibility check succeeds.

Author: Andres Freund
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
parent 4bb50236
......@@ -28,7 +28,6 @@
#include "postgres.h"
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tsmapi.h"
#include "catalog/pg_type.h"
......@@ -46,7 +45,6 @@ typedef struct
{
uint32 seed; /* random seed */
int64 ntuples; /* number of tuples to return */
int64 donetuples; /* number of tuples already returned */
OffsetNumber lt; /* last tuple returned from current block */
BlockNumber doneblocks; /* number of already-scanned blocks */
BlockNumber lb; /* last block visited */
......@@ -67,11 +65,10 @@ static void system_rows_beginsamplescan(SampleScanState *node,
Datum *params,
int nparams,
uint32 seed);
static BlockNumber system_rows_nextsampleblock(SampleScanState *node);
static BlockNumber system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
static OffsetNumber system_rows_nextsampletuple(SampleScanState *node,
BlockNumber blockno,
OffsetNumber maxoffset);
static bool SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan);
static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate);
......@@ -187,7 +184,6 @@ system_rows_beginsamplescan(SampleScanState *node,
sampler->seed = seed;
sampler->ntuples = ntuples;
sampler->donetuples = 0;
sampler->lt = InvalidOffsetNumber;
sampler->doneblocks = 0;
/* lb will be initialized during first NextSampleBlock call */
......@@ -206,11 +202,9 @@ system_rows_beginsamplescan(SampleScanState *node,
* Uses linear probing algorithm for picking next block.
*/
static BlockNumber
system_rows_nextsampleblock(SampleScanState *node)
system_rows_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
{
SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
TableScanDesc scan = node->ss.ss_currentScanDesc;
HeapScanDesc hscan = (HeapScanDesc) scan;
/* First call within scan? */
if (sampler->doneblocks == 0)
......@@ -222,14 +216,14 @@ system_rows_nextsampleblock(SampleScanState *node)
SamplerRandomState randstate;
/* If relation is empty, there's nothing to scan */
if (hscan->rs_nblocks == 0)
if (nblocks == 0)
return InvalidBlockNumber;
/* We only need an RNG during this setup step */
sampler_random_init_state(sampler->seed, randstate);
/* Compute nblocks/firstblock/step only once per query */
sampler->nblocks = hscan->rs_nblocks;
sampler->nblocks = nblocks;
/* Choose random starting block within the relation */
/* (Actually this is the predecessor of the first block visited) */
......@@ -246,7 +240,7 @@ system_rows_nextsampleblock(SampleScanState *node)
/* If we've read all blocks or returned all needed tuples, we're done */
if (++sampler->doneblocks > sampler->nblocks ||
sampler->donetuples >= sampler->ntuples)
node->donetuples >= sampler->ntuples)
return InvalidBlockNumber;
/*
......@@ -259,7 +253,7 @@ system_rows_nextsampleblock(SampleScanState *node)
{
/* Advance lb, using uint64 arithmetic to forestall overflow */
sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
} while (sampler->lb >= hscan->rs_nblocks);
} while (sampler->lb >= nblocks);
return sampler->lb;
}
......@@ -279,23 +273,12 @@ system_rows_nextsampletuple(SampleScanState *node,
OffsetNumber maxoffset)
{
SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state;
TableScanDesc scan = node->ss.ss_currentScanDesc;
HeapScanDesc hscan = (HeapScanDesc) scan;
OffsetNumber tupoffset = sampler->lt;
/* Quit if we've returned all needed tuples */
if (sampler->donetuples >= sampler->ntuples)
if (node->donetuples >= sampler->ntuples)
return InvalidOffsetNumber;
/*
* Because we should only count visible tuples as being returned, we need
* to search for a visible tuple rather than just let the core code do it.
*/
/* We rely on the data accumulated in pagemode access */
Assert(scan->rs_pageatatime);
for (;;)
{
/* Advance to next possible offset on page */
if (tupoffset == InvalidOffsetNumber)
tupoffset = FirstOffsetNumber;
......@@ -304,52 +287,13 @@ system_rows_nextsampletuple(SampleScanState *node,
/* Done? */
if (tupoffset > maxoffset)
{
tupoffset = InvalidOffsetNumber;
break;
}
/* Found a candidate? */
if (SampleOffsetVisible(tupoffset, hscan))
{
sampler->donetuples++;
break;
}
}
sampler->lt = tupoffset;
return tupoffset;
}
/*
* Check if tuple offset is visible
*
* In pageatatime mode, heapgetpage() already did visibility checks,
* so just look at the info it left in rs_vistuples[].
*/
static bool
SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan)
{
int start = 0,
end = scan->rs_ntuples - 1;
while (start <= end)
{
int mid = (start + end) / 2;
OffsetNumber curoffset = scan->rs_vistuples[mid];
if (tupoffset == curoffset)
return true;
else if (tupoffset < curoffset)
end = mid - 1;
else
start = mid + 1;
}
return false;
}
/*
* Compute greatest common divisor of two uint32's.
*/
......
......@@ -26,7 +26,6 @@
#include <math.h>
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tsmapi.h"
#include "catalog/pg_type.h"
......@@ -66,7 +65,7 @@ static void system_time_beginsamplescan(SampleScanState *node,
Datum *params,
int nparams,
uint32 seed);
static BlockNumber system_time_nextsampleblock(SampleScanState *node);
static BlockNumber system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
static OffsetNumber system_time_nextsampletuple(SampleScanState *node,
BlockNumber blockno,
OffsetNumber maxoffset);
......@@ -213,11 +212,9 @@ system_time_beginsamplescan(SampleScanState *node,
* Uses linear probing algorithm for picking next block.
*/
static BlockNumber
system_time_nextsampleblock(SampleScanState *node)
system_time_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
{
SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state;
TableScanDesc scan = node->ss.ss_currentScanDesc;
HeapScanDesc hscan = (HeapScanDesc) scan;
instr_time cur_time;
/* First call within scan? */
......@@ -230,14 +227,14 @@ system_time_nextsampleblock(SampleScanState *node)
SamplerRandomState randstate;
/* If relation is empty, there's nothing to scan */
if (hscan->rs_nblocks == 0)
if (nblocks == 0)
return InvalidBlockNumber;
/* We only need an RNG during this setup step */
sampler_random_init_state(sampler->seed, randstate);
/* Compute nblocks/firstblock/step only once per query */
sampler->nblocks = hscan->rs_nblocks;
sampler->nblocks = nblocks;
/* Choose random starting block within the relation */
/* (Actually this is the predecessor of the first block visited) */
......@@ -273,7 +270,7 @@ system_time_nextsampleblock(SampleScanState *node)
{
/* Advance lb, using uint64 arithmetic to forestall overflow */
sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks;
} while (sampler->lb >= hscan->rs_nblocks);
} while (sampler->lb >= nblocks);
return sampler->lb;
}
......
......@@ -227,7 +227,7 @@ BeginSampleScan (SampleScanState *node,
<para>
<programlisting>
BlockNumber
NextSampleBlock (SampleScanState *node);
NextSampleBlock (SampleScanState *node, BlockNumber nblocks);
</programlisting>
Returns the block number of the next page to be scanned, or
......@@ -262,10 +262,9 @@ NextSampleTuple (SampleScanState *node,
numbers in the range <literal>1 .. maxoffset</literal> actually contain valid
tuples. This is not normally a problem since the core code ignores
requests to sample missing or invisible tuples; that should not result in
any bias in the sample. However, if necessary, the function can
examine <literal>node-&gt;ss.ss_currentScanDesc-&gt;rs_vistuples[]</literal>
to identify which tuples are valid and visible. (This
requires <literal>node-&gt;use_pagemode</literal> to be <literal>true</literal>.)
any bias in the sample. However, if necessary, the function can use
<literal>node-&gt;donetuples</literal> to examine how many of the tuples
it returned were vlaid and visible.
</para>
</note>
......
......@@ -28,6 +28,7 @@
#include "access/multixact.h"
#include "access/rewriteheap.h"
#include "access/tableam.h"
#include "access/tsmapi.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/index.h"
......@@ -52,6 +53,9 @@ static void reform_and_rewrite_tuple(HeapTuple tuple,
Relation OldHeap, Relation NewHeap,
Datum *values, bool *isnull, RewriteState rwstate);
static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
HeapTuple tuple,
OffsetNumber tupoffset);
static const TableAmRoutine heapam_methods;
......@@ -1943,6 +1947,181 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
}
/* ------------------------------------------------------------------------
* Executor related callbacks for the heap AM
* ------------------------------------------------------------------------
*/
static bool
heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
TsmRoutine *tsm = scanstate->tsmroutine;
BlockNumber blockno;
/* return false immediately if relation is empty */
if (hscan->rs_nblocks == 0)
return false;
if (tsm->NextSampleBlock)
{
blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
hscan->rs_cblock = blockno;
}
else
{
/* scanning table sequentially */
if (hscan->rs_cblock == InvalidBlockNumber)
{
Assert(!hscan->rs_inited);
blockno = hscan->rs_startblock;
}
else
{
Assert(hscan->rs_inited);
blockno = hscan->rs_cblock + 1;
if (blockno >= hscan->rs_nblocks)
{
/* wrap to begining of rel, might not have started at 0 */
blockno = 0;
}
/*
* Report our new scan position for synchronization purposes.
*
* Note: we do this before checking for end of scan so that the
* final state of the position hint is back at the start of the
* rel. That's not strictly necessary, but otherwise when you run
* the same query multiple times the starting position would shift
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_syncscan)
ss_report_location(scan->rs_rd, blockno);
if (blockno == hscan->rs_startblock)
{
blockno = InvalidBlockNumber;
}
}
}
if (!BlockNumberIsValid(blockno))
{
if (BufferIsValid(hscan->rs_cbuf))
ReleaseBuffer(hscan->rs_cbuf);
hscan->rs_cbuf = InvalidBuffer;
hscan->rs_cblock = InvalidBlockNumber;
hscan->rs_inited = false;
return false;
}
heapgetpage(scan, blockno);
hscan->rs_inited = true;
return true;
}
static bool
heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
TupleTableSlot *slot)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
TsmRoutine *tsm = scanstate->tsmroutine;
BlockNumber blockno = hscan->rs_cblock;
bool pagemode = scan->rs_pageatatime;
Page page;
bool all_visible;
OffsetNumber maxoffset;
/*
* When not using pagemode, we must lock the buffer during tuple
* visibility checks.
*/
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
page = (Page) BufferGetPage(hscan->rs_cbuf);
all_visible = PageIsAllVisible(page) &&
!scan->rs_snapshot->takenDuringRecovery;
maxoffset = PageGetMaxOffsetNumber(page);
for (;;)
{
OffsetNumber tupoffset;
CHECK_FOR_INTERRUPTS();
/* Ask the tablesample method which tuples to check on this page. */
tupoffset = tsm->NextSampleTuple(scanstate,
blockno,
maxoffset);
if (OffsetNumberIsValid(tupoffset))
{
ItemId itemid;
bool visible;
HeapTuple tuple = &(hscan->rs_ctup);
/* Skip invalid tuple pointers. */
itemid = PageGetItemId(page, tupoffset);
if (!ItemIdIsNormal(itemid))
continue;
tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple->t_len = ItemIdGetLength(itemid);
ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
if (all_visible)
visible = true;
else
visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
tuple, tupoffset);
/* in pagemode, heapgetpage did this for us */
if (!pagemode)
CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
hscan->rs_cbuf, scan->rs_snapshot);
/* Try next tuple from same page. */
if (!visible)
continue;
/* Found visible tuple, return it. */
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
/* Count successfully-fetched tuples as heap fetches */
pgstat_count_heap_getnext(scan->rs_rd);
return true;
}
else
{
/*
* If we get here, it means we've exhausted the items on this page
* and it's time to move to the next.
*/
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
ExecClearTuple(slot);
return false;
}
}
Assert(0);
}
/* ----------------------------------------------------------------------------
* Helper functions for the above.
* ----------------------------------------------------------------------------
......@@ -1991,6 +2170,53 @@ reform_and_rewrite_tuple(HeapTuple tuple,
heap_freetuple(copiedTuple);
}
/*
* Check visibility of the tuple.
*/
static bool
SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
HeapTuple tuple,
OffsetNumber tupoffset)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
if (scan->rs_pageatatime)
{
/*
* In pageatatime mode, heapgetpage() already did visibility checks,
* so just look at the info it left in rs_vistuples[].
*
* We use a binary search over the known-sorted array. Note: we could
* save some effort if we insisted that NextSampleTuple select tuples
* in increasing order, but it's not clear that there would be enough
* gain to justify the restriction.
*/
int start = 0,
end = hscan->rs_ntuples - 1;
while (start <= end)
{
int mid = (start + end) / 2;
OffsetNumber curoffset = hscan->rs_vistuples[mid];
if (tupoffset == curoffset)
return true;
else if (tupoffset < curoffset)
end = mid - 1;
else
start = mid + 1;
}
return false;
}
else
{
/* Otherwise, we have to check the tuple individually. */
return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
buffer);
}
}
/* ------------------------------------------------------------------------
* Definition of the heap table access method.
......@@ -2039,6 +2265,9 @@ static const TableAmRoutine heapam_methods = {
.index_validate_scan = heapam_index_validate_scan,
.relation_estimate_size = heapam_estimate_rel_size,
.scan_sample_next_block = heapam_scan_sample_next_block,
.scan_sample_next_tuple = heapam_scan_sample_next_tuple
};
......
......@@ -89,6 +89,9 @@ GetTableAmRoutine(Oid amhandler)
Assert(routine->index_validate_scan != NULL);
Assert(routine->relation_estimate_size != NULL);
Assert(routine->scan_sample_next_block != NULL);
Assert(routine->scan_sample_next_tuple != NULL);
return routine;
}
......
......@@ -26,7 +26,6 @@
#include <math.h>
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tsmapi.h"
#include "catalog/pg_type.h"
......@@ -56,7 +55,7 @@ static void system_beginsamplescan(SampleScanState *node,
Datum *params,
int nparams,
uint32 seed);
static BlockNumber system_nextsampleblock(SampleScanState *node);
static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
static OffsetNumber system_nextsampletuple(SampleScanState *node,
BlockNumber blockno,
OffsetNumber maxoffset);
......@@ -177,11 +176,9 @@ system_beginsamplescan(SampleScanState *node,
* Select next block to sample.
*/
static BlockNumber
system_nextsampleblock(SampleScanState *node)
system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
{
SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
TableScanDesc scan = node->ss.ss_currentScanDesc;
HeapScanDesc hscan = (HeapScanDesc) scan;
BlockNumber nextblock = sampler->nextblock;
uint32 hashinput[2];
......@@ -200,7 +197,7 @@ system_nextsampleblock(SampleScanState *node)
* Loop over block numbers until finding suitable block or reaching end of
* relation.
*/
for (; nextblock < hscan->rs_nblocks; nextblock++)
for (; nextblock < nblocks; nextblock++)
{
uint32 hash;
......@@ -212,7 +209,7 @@ system_nextsampleblock(SampleScanState *node)
break;
}
if (nextblock < hscan->rs_nblocks)
if (nextblock < nblocks)
{
/* Found a suitable block; remember where we should start next time */
sampler->nextblock = nextblock + 1;
......
......@@ -14,7 +14,6 @@
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/tsmapi.h"
......@@ -29,9 +28,7 @@
static TupleTableSlot *SampleNext(SampleScanState *node);
static void tablesample_init(SampleScanState *scanstate);
static HeapTuple tablesample_getnext(SampleScanState *scanstate);
static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
HeapScanDesc scan);
static TupleTableSlot *tablesample_getnext(SampleScanState *scanstate);
/* ----------------------------------------------------------------
* Scan Support
......@@ -47,10 +44,6 @@ static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
static TupleTableSlot *
SampleNext(SampleScanState *node)
{
HeapTuple tuple;
TupleTableSlot *slot;
HeapScanDesc hscan;
/*
* if this is first call within a scan, initialize
*/
......@@ -60,19 +53,7 @@ SampleNext(SampleScanState *node)
/*
* get the next tuple, and store it in our result slot
*/
tuple = tablesample_getnext(node);
slot = node->ss.ss_ScanTupleSlot;
hscan = (HeapScanDesc) node->ss.ss_currentScanDesc;
if (tuple)
ExecStoreBufferHeapTuple(tuple, /* tuple to store */
slot, /* slot to store in */
hscan->rs_cbuf); /* tuple's buffer */
else
ExecClearTuple(slot);
return slot;
return tablesample_getnext(node);
}
/*
......@@ -237,6 +218,9 @@ ExecReScanSampleScan(SampleScanState *node)
{
/* Remember we need to do BeginSampleScan again (if we did it at all) */
node->begun = false;
node->done = false;
node->haveblock = false;
node->donetuples = 0;
ExecScanReScan(&node->ss);
}
......@@ -258,6 +242,7 @@ tablesample_init(SampleScanState *scanstate)
int i;
ListCell *arg;
scanstate->donetuples = 0;
params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
i = 0;
......@@ -345,225 +330,49 @@ tablesample_init(SampleScanState *scanstate)
/*
* Get next tuple from TABLESAMPLE method.
*
* Note: an awful lot of this is copied-and-pasted from heapam.c. It would
* perhaps be better to refactor to share more code.
*/
static HeapTuple
static TupleTableSlot *
tablesample_getnext(SampleScanState *scanstate)
{
TsmRoutine *tsm = scanstate->tsmroutine;
TableScanDesc scan = scanstate->ss.ss_currentScanDesc;
HeapScanDesc hscan = (HeapScanDesc) scan;
HeapTuple tuple = &(hscan->rs_ctup);
Snapshot snapshot = scan->rs_snapshot;
bool pagemode = scan->rs_pageatatime;
BlockNumber blockno;
Page page;
bool all_visible;
OffsetNumber maxoffset;
if (!hscan->rs_inited)
{
/*
* return null immediately if relation is empty
*/
if (hscan->rs_nblocks == 0)
{
Assert(!BufferIsValid(hscan->rs_cbuf));
tuple->t_data = NULL;
return NULL;
}
if (tsm->NextSampleBlock)
{
blockno = tsm->NextSampleBlock(scanstate);
if (!BlockNumberIsValid(blockno))
{
tuple->t_data = NULL;
return NULL;
}
}
else
blockno = hscan->rs_startblock;
Assert(blockno < hscan->rs_nblocks);
heapgetpage(scan, blockno);
hscan->rs_inited = true;
}
else
{
/* continue from previously returned page/tuple */
blockno = hscan->rs_cblock; /* current page */
}
TupleTableSlot *slot = scanstate->ss.ss_ScanTupleSlot;
/*
* When not using pagemode, we must lock the buffer during tuple
* visibility checks.
*/
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
ExecClearTuple(slot);
page = (Page) BufferGetPage(hscan->rs_cbuf);
all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
maxoffset = PageGetMaxOffsetNumber(page);
if (scanstate->done)
return NULL;
for (;;)
{
OffsetNumber tupoffset;
bool finished;
CHECK_FOR_INTERRUPTS();
/* Ask the tablesample method which tuples to check on this page. */
tupoffset = tsm->NextSampleTuple(scanstate,
blockno,
maxoffset);
if (OffsetNumberIsValid(tupoffset))
{
ItemId itemid;
bool visible;
/* Skip invalid tuple pointers. */
itemid = PageGetItemId(page, tupoffset);
if (!ItemIdIsNormal(itemid))
continue;
tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple->t_len = ItemIdGetLength(itemid);
ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
if (all_visible)
visible = true;
else
visible = SampleTupleVisible(tuple, tupoffset, hscan);
/* in pagemode, heapgetpage did this for us */
if (!pagemode)
CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
hscan->rs_cbuf, snapshot);
if (visible)
if (!scanstate->haveblock)
{
/* Found visible tuple, return it. */
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
break;
}
else
if (!table_scan_sample_next_block(scan, scanstate))
{
/* Try next tuple from same page. */
continue;
}
}
/*
* if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
if (tsm->NextSampleBlock)
{
blockno = tsm->NextSampleBlock(scanstate);
Assert(!scan->rs_syncscan);
finished = !BlockNumberIsValid(blockno);
}
else
{
/* Without NextSampleBlock, just do a plain forward seqscan. */
blockno++;
if (blockno >= hscan->rs_nblocks)
blockno = 0;
scanstate->haveblock = false;
scanstate->done = true;
/*
* Report our new scan position for synchronization purposes.
*
* Note: we do this before checking for end of scan so that the
* final state of the position hint is back at the start of the
* rel. That's not strictly necessary, but otherwise when you run
* the same query multiple times the starting position would shift
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_syncscan)
ss_report_location(scan->rs_rd, blockno);
finished = (blockno == hscan->rs_startblock);
}
/*
* Reached end of scan?
*/
if (finished)
{
if (BufferIsValid(hscan->rs_cbuf))
ReleaseBuffer(hscan->rs_cbuf);
hscan->rs_cbuf = InvalidBuffer;
hscan->rs_cblock = InvalidBlockNumber;
tuple->t_data = NULL;
hscan->rs_inited = false;
/* exhausted relation */
return NULL;
}
Assert(blockno < hscan->rs_nblocks);
heapgetpage(scan, blockno);
/* Re-establish state for new page */
if (!pagemode)
LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
page = (Page) BufferGetPage(hscan->rs_cbuf);
all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
maxoffset = PageGetMaxOffsetNumber(page);
scanstate->haveblock = true;
}
/* Count successfully-fetched tuples as heap fetches */
pgstat_count_heap_getnext(scan->rs_rd);
return &(hscan->rs_ctup);
}
/*
* Check visibility of the tuple.
*/
static bool
SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
{
if (scan->rs_base.rs_pageatatime)
if (!table_scan_sample_next_tuple(scan, scanstate, slot))
{
/*
* In pageatatime mode, heapgetpage() already did visibility checks,
* so just look at the info it left in rs_vistuples[].
*
* We use a binary search over the known-sorted array. Note: we could
* save some effort if we insisted that NextSampleTuple select tuples
* in increasing order, but it's not clear that there would be enough
* gain to justify the restriction.
* If we get here, it means we've exhausted the items on this page
* and it's time to move to the next.
*/
int start = 0,
end = scan->rs_ntuples - 1;
while (start <= end)
{
int mid = (start + end) / 2;
OffsetNumber curoffset = scan->rs_vistuples[mid];
if (tupoffset == curoffset)
return true;
else if (tupoffset < curoffset)
end = mid - 1;
else
start = mid + 1;
scanstate->haveblock = false;
continue;
}
return false;
}
else
{
/* Otherwise, we have to check the tuple individually. */
return HeapTupleSatisfiesVisibility(tuple,
scan->rs_base.rs_snapshot,
scan->rs_cbuf);
/* Found visible tuple, return it. */
break;
}
scanstate->donetuples++;
return slot;
}
......@@ -30,6 +30,7 @@ extern bool synchronize_seqscans;
struct BulkInsertStateData;
struct IndexInfo;
struct IndexBuildCallback;
struct SampleScanState;
struct VacuumParams;
struct ValidateIndexState;
......@@ -519,6 +520,56 @@ typedef struct TableAmRoutine
BlockNumber *pages, double *tuples,
double *allvisfrac);
/* ------------------------------------------------------------------------
* Executor related functions.
* ------------------------------------------------------------------------
*/
/*
* Acquire the next block in a sample scan. Return false if the sample
* scan is finished, true otherwise.
*
* Typically this will first determine the target block by call the
* TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
* perform a sequential scan over all blocks. The determined block is
* then typically read and pinned.
*
* As the TsmRoutine interface is block based, a block needs to be passed
* to NextSampleBlock(). If that's not appropriate for an AM, it
* internally needs to perform mapping between the internal and a block
* based representation.
*
* Note that it's not acceptable to hold deadlock prone resources such as
* lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
* block - the tuple is likely to be returned to an upper query node, and
* the next call could be off a long while. Holding buffer pins etc is
* obviously OK.
*
* Currently it is required to implement this interface, as there's no
* alternative way (contrary e.g. to bitmap scans) to implement sample
* scans. If infeasible to implement the AM may raise an error.
*/
bool (*scan_sample_next_block) (TableScanDesc scan,
struct SampleScanState *scanstate);
/*
* This callback, only called after scan_sample_next_block has returned
* true, should determine the next tuple to be returned from the selected
* block using the TsmRoutine's NextSampleTuple() callback.
*
* The callback needs to perform visibility checks, and only return
* visible tuples. That obviously can mean calling NextSampletuple()
* multiple times.
*
* The TsmRoutine interface assumes that there's a maximum offset on a
* given page, so if that doesn't apply to an AM, it needs to emulate that
* assumption somehow.
*/
bool (*scan_sample_next_tuple) (TableScanDesc scan,
struct SampleScanState *scanstate,
TupleTableSlot *slot);
} TableAmRoutine;
......@@ -1339,6 +1390,43 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
}
/* ----------------------------------------------------------------------------
* Executor related functionality
* ----------------------------------------------------------------------------
*/
/*
* Acquire the next block in a sample scan. Returns false if the sample scan
* is finished, true otherwise.
*
* This will call the TsmRoutine's NextSampleBlock() callback if necessary
* (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
* underlying relation.
*/
static inline bool
table_scan_sample_next_block(TableScanDesc scan,
struct SampleScanState *scanstate)
{
return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
}
/*
* Fetch the next sample tuple into `slot` and return true if a visible tuple
* was found, false otherwise. table_scan_sample_next_block() needs to
* previously have selected a block (i.e. returned true).
*
* This will call the TsmRoutine's NextSampleTuple() callback.
*/
static inline bool
table_scan_sample_next_tuple(TableScanDesc scan,
struct SampleScanState *scanstate,
TupleTableSlot *slot)
{
return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
slot);
}
/* ----------------------------------------------------------------------------
* Functions to make modifications a bit simpler.
* ----------------------------------------------------------------------------
......
......@@ -34,7 +34,8 @@ typedef void (*BeginSampleScan_function) (SampleScanState *node,
int nparams,
uint32 seed);
typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node);
typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node,
BlockNumber nblocks);
typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node,
BlockNumber blockno,
......
......@@ -1305,6 +1305,9 @@ typedef struct SampleScanState
bool use_pagemode; /* use page-at-a-time visibility checking? */
bool begun; /* false means need to call BeginSampleScan */
uint32 seed; /* random seed */
int64 donetuples; /* number of tuples already returned */
bool haveblock; /* has a block for sampling been determined */
bool done; /* exhausted all tuples? */
} SampleScanState;
/*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment