Commit 147e3722 authored by Andres Freund's avatar Andres Freund

tableam: Avoid relying on relation size to determine validity of tids.

Instead add a tableam callback to do so. To avoid adding per
validation overhead, pass a scan to tuple_tid_valid. In heap's case
we'd otherwise incurred a RelationGetNumberOfBlocks() call for each
tid - which'd have added noticable overhead to nodeTidscan.c.

Author: Andres Freund
Reviewed-By: Ashwin Agrawal
Discussion: https://postgr.es/m/20190515185447.gno2jtqxyktylyvs@alap3.anarazel.de
parent 7f44ede5
...@@ -1654,8 +1654,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ...@@ -1654,8 +1654,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
/* /*
* heap_get_latest_tid - get the latest tid of a specified tuple * heap_get_latest_tid - get the latest tid of a specified tuple
* *
* Actually, this gets the latest version that is visible according to * Actually, this gets the latest version that is visible according to the
* the passed snapshot. You can pass SnapshotDirty to get the very latest, * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
* possibly uncommitted version. * possibly uncommitted version.
* *
* *tid is both an input and an output parameter: it is updated to * *tid is both an input and an output parameter: it is updated to
...@@ -1663,28 +1663,20 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ...@@ -1663,28 +1663,20 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
* if no version of the row passes the snapshot test. * if no version of the row passes the snapshot test.
*/ */
void void
heap_get_latest_tid(Relation relation, heap_get_latest_tid(TableScanDesc sscan,
Snapshot snapshot,
ItemPointer tid) ItemPointer tid)
{ {
BlockNumber blk; Relation relation = sscan->rs_rd;
Snapshot snapshot = sscan->rs_snapshot;
ItemPointerData ctid; ItemPointerData ctid;
TransactionId priorXmax; TransactionId priorXmax;
/* this is to avoid Assert failures on bad input */
if (!ItemPointerIsValid(tid))
return;
/* /*
* Since this can be called with user-supplied TID, don't trust the input * table_get_latest_tid verified that the passed in tid is valid. Assume
* too much. (RelationGetNumberOfBlocks is an expensive check, so we * that t_ctid links are valid however - there shouldn't be invalid ones
* don't check t_ctid links again this way. Note that it would not do to * in the table.
* call it just once and save the result, either.)
*/ */
blk = ItemPointerGetBlockNumber(tid); Assert(ItemPointerIsValid(tid));
if (blk >= RelationGetNumberOfBlocks(relation))
elog(ERROR, "block number %u is out of range for relation \"%s\"",
blk, RelationGetRelationName(relation));
/* /*
* Loop to chase down t_ctid links. At top of loop, ctid is the tuple we * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
......
...@@ -204,6 +204,15 @@ heapam_fetch_row_version(Relation relation, ...@@ -204,6 +204,15 @@ heapam_fetch_row_version(Relation relation,
return false; return false;
} }
static bool
heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
return ItemPointerIsValid(tid) &&
ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
}
static bool static bool
heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
Snapshot snapshot) Snapshot snapshot)
...@@ -2568,6 +2577,7 @@ static const TableAmRoutine heapam_methods = { ...@@ -2568,6 +2577,7 @@ static const TableAmRoutine heapam_methods = {
.tuple_fetch_row_version = heapam_fetch_row_version, .tuple_fetch_row_version = heapam_fetch_row_version,
.tuple_get_latest_tid = heap_get_latest_tid, .tuple_get_latest_tid = heap_get_latest_tid,
.tuple_tid_valid = heapam_tuple_tid_valid,
.tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
.compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
......
...@@ -213,6 +213,33 @@ table_index_fetch_tuple_check(Relation rel, ...@@ -213,6 +213,33 @@ table_index_fetch_tuple_check(Relation rel,
} }
/* ------------------------------------------------------------------------
* Functions for non-modifying operations on individual tuples
* ------------------------------------------------------------------------
*/
void
table_get_latest_tid(TableScanDesc scan, ItemPointer tid)
{
Relation rel = scan->rs_rd;
const TableAmRoutine *tableam = rel->rd_tableam;
/*
* Since this can be called with user-supplied TID, don't trust the input
* too much.
*/
if (!tableam->tuple_tid_valid(scan, tid))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("tid (%u, %u) is not valid for relation for relation \"%s\"",
ItemPointerGetBlockNumberNoCheck(tid),
ItemPointerGetOffsetNumberNoCheck(tid),
RelationGetRelationName(rel))));
return tableam->tuple_get_latest_tid(scan, tid);
}
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* Functions to make modifications a bit simpler. * Functions to make modifications a bit simpler.
* ---------------------------------------------------------------------------- * ----------------------------------------------------------------------------
......
...@@ -129,19 +129,23 @@ static void ...@@ -129,19 +129,23 @@ static void
TidListEval(TidScanState *tidstate) TidListEval(TidScanState *tidstate)
{ {
ExprContext *econtext = tidstate->ss.ps.ps_ExprContext; ExprContext *econtext = tidstate->ss.ps.ps_ExprContext;
BlockNumber nblocks; TableScanDesc scan;
ItemPointerData *tidList; ItemPointerData *tidList;
int numAllocTids; int numAllocTids;
int numTids; int numTids;
ListCell *l; ListCell *l;
/* /*
* We silently discard any TIDs that are out of range at the time of scan * Start scan on-demand - initializing a scan isn't free (e.g. heap stats
* start. (Since we hold at least AccessShareLock on the table, it won't * the size of the table), so it makes sense to delay that until needed -
* be possible for someone to truncate away the blocks we intend to * the node might never get executed.
* visit.)
*/ */
nblocks = RelationGetNumberOfBlocks(tidstate->ss.ss_currentRelation); if (tidstate->ss.ss_currentScanDesc == NULL)
tidstate->ss.ss_currentScanDesc =
table_beginscan(tidstate->ss.ss_currentRelation,
tidstate->ss.ps.state->es_snapshot,
0, NULL);
scan = tidstate->ss.ss_currentScanDesc;
/* /*
* We initialize the array with enough slots for the case that all quals * We initialize the array with enough slots for the case that all quals
...@@ -165,10 +169,19 @@ TidListEval(TidScanState *tidstate) ...@@ -165,10 +169,19 @@ TidListEval(TidScanState *tidstate)
DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate, DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate,
econtext, econtext,
&isNull)); &isNull));
if (!isNull && if (isNull)
ItemPointerIsValid(itemptr) && continue;
ItemPointerGetBlockNumber(itemptr) < nblocks)
{ /*
* We silently discard any TIDs that the AM considers invalid
* (E.g. for heap, they could be out of range at the time of scan
* start. Since we hold at least AccessShareLock on the table, it
* won't be possible for someone to truncate away the blocks we
* intend to visit.).
*/
if (!table_tuple_tid_valid(scan, itemptr))
continue;
if (numTids >= numAllocTids) if (numTids >= numAllocTids)
{ {
numAllocTids *= 2; numAllocTids *= 2;
...@@ -178,7 +191,6 @@ TidListEval(TidScanState *tidstate) ...@@ -178,7 +191,6 @@ TidListEval(TidScanState *tidstate)
} }
tidList[numTids++] = *itemptr; tidList[numTids++] = *itemptr;
} }
}
else if (tidexpr->exprstate && tidexpr->isarray) else if (tidexpr->exprstate && tidexpr->isarray)
{ {
Datum arraydatum; Datum arraydatum;
...@@ -206,14 +218,16 @@ TidListEval(TidScanState *tidstate) ...@@ -206,14 +218,16 @@ TidListEval(TidScanState *tidstate)
} }
for (i = 0; i < ndatums; i++) for (i = 0; i < ndatums; i++)
{ {
if (!ipnulls[i]) if (ipnulls[i])
{ continue;
itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]);
if (ItemPointerIsValid(itemptr) &&
ItemPointerGetBlockNumber(itemptr) < nblocks) if (!table_tuple_tid_valid(scan, itemptr))
continue;
tidList[numTids++] = *itemptr; tidList[numTids++] = *itemptr;
} }
}
pfree(ipdatums); pfree(ipdatums);
pfree(ipnulls); pfree(ipnulls);
} }
...@@ -306,6 +320,7 @@ TidNext(TidScanState *node) ...@@ -306,6 +320,7 @@ TidNext(TidScanState *node)
EState *estate; EState *estate;
ScanDirection direction; ScanDirection direction;
Snapshot snapshot; Snapshot snapshot;
TableScanDesc scan;
Relation heapRelation; Relation heapRelation;
TupleTableSlot *slot; TupleTableSlot *slot;
ItemPointerData *tidList; ItemPointerData *tidList;
...@@ -327,6 +342,7 @@ TidNext(TidScanState *node) ...@@ -327,6 +342,7 @@ TidNext(TidScanState *node)
if (node->tss_TidList == NULL) if (node->tss_TidList == NULL)
TidListEval(node); TidListEval(node);
scan = node->ss.ss_currentScanDesc;
tidList = node->tss_TidList; tidList = node->tss_TidList;
numTids = node->tss_NumTids; numTids = node->tss_NumTids;
...@@ -365,7 +381,7 @@ TidNext(TidScanState *node) ...@@ -365,7 +381,7 @@ TidNext(TidScanState *node)
* current according to our snapshot. * current according to our snapshot.
*/ */
if (node->tss_isCurrentOf) if (node->tss_isCurrentOf)
table_get_latest_tid(heapRelation, snapshot, &tid); table_get_latest_tid(scan, &tid);
if (table_fetch_row_version(heapRelation, &tid, snapshot, slot)) if (table_fetch_row_version(heapRelation, &tid, snapshot, slot))
return slot; return slot;
...@@ -442,6 +458,10 @@ ExecReScanTidScan(TidScanState *node) ...@@ -442,6 +458,10 @@ ExecReScanTidScan(TidScanState *node)
node->tss_NumTids = 0; node->tss_NumTids = 0;
node->tss_TidPtr = -1; node->tss_TidPtr = -1;
/* not really necessary, but seems good form */
if (node->ss.ss_currentScanDesc)
table_rescan(node->ss.ss_currentScanDesc, NULL);
ExecScanReScan(&node->ss); ExecScanReScan(&node->ss);
} }
...@@ -455,6 +475,9 @@ ExecReScanTidScan(TidScanState *node) ...@@ -455,6 +475,9 @@ ExecReScanTidScan(TidScanState *node)
void void
ExecEndTidScan(TidScanState *node) ExecEndTidScan(TidScanState *node)
{ {
if (node->ss.ss_currentScanDesc)
table_endscan(node->ss.ss_currentScanDesc);
/* /*
* Free the exprcontext * Free the exprcontext
*/ */
......
...@@ -358,6 +358,7 @@ currtid_byreloid(PG_FUNCTION_ARGS) ...@@ -358,6 +358,7 @@ currtid_byreloid(PG_FUNCTION_ARGS)
Relation rel; Relation rel;
AclResult aclresult; AclResult aclresult;
Snapshot snapshot; Snapshot snapshot;
TableScanDesc scan;
result = (ItemPointer) palloc(sizeof(ItemPointerData)); result = (ItemPointer) palloc(sizeof(ItemPointerData));
if (!reloid) if (!reloid)
...@@ -380,7 +381,9 @@ currtid_byreloid(PG_FUNCTION_ARGS) ...@@ -380,7 +381,9 @@ currtid_byreloid(PG_FUNCTION_ARGS)
ItemPointerCopy(tid, result); ItemPointerCopy(tid, result);
snapshot = RegisterSnapshot(GetLatestSnapshot()); snapshot = RegisterSnapshot(GetLatestSnapshot());
table_get_latest_tid(rel, snapshot, result); scan = table_beginscan(rel, snapshot, 0, NULL);
table_get_latest_tid(scan, result);
table_endscan(scan);
UnregisterSnapshot(snapshot); UnregisterSnapshot(snapshot);
table_close(rel, AccessShareLock); table_close(rel, AccessShareLock);
...@@ -398,6 +401,7 @@ currtid_byrelname(PG_FUNCTION_ARGS) ...@@ -398,6 +401,7 @@ currtid_byrelname(PG_FUNCTION_ARGS)
Relation rel; Relation rel;
AclResult aclresult; AclResult aclresult;
Snapshot snapshot; Snapshot snapshot;
TableScanDesc scan;
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = table_openrv(relrv, AccessShareLock); rel = table_openrv(relrv, AccessShareLock);
...@@ -415,7 +419,9 @@ currtid_byrelname(PG_FUNCTION_ARGS) ...@@ -415,7 +419,9 @@ currtid_byrelname(PG_FUNCTION_ARGS)
ItemPointerCopy(tid, result); ItemPointerCopy(tid, result);
snapshot = RegisterSnapshot(GetLatestSnapshot()); snapshot = RegisterSnapshot(GetLatestSnapshot());
table_get_latest_tid(rel, snapshot, result); scan = table_beginscan(rel, snapshot, 0, NULL);
table_get_latest_tid(scan, result);
table_endscan(scan);
UnregisterSnapshot(snapshot); UnregisterSnapshot(snapshot);
table_close(rel, AccessShareLock); table_close(rel, AccessShareLock);
......
...@@ -134,8 +134,7 @@ extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, ...@@ -134,8 +134,7 @@ extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
bool *all_dead, bool first_call); bool *all_dead, bool first_call);
extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid);
ItemPointer tid);
extern void setLastTid(const ItemPointer tid); extern void setLastTid(const ItemPointer tid);
extern BulkInsertState GetBulkInsertState(void); extern BulkInsertState GetBulkInsertState(void);
......
...@@ -308,12 +308,17 @@ typedef struct TableAmRoutine ...@@ -308,12 +308,17 @@ typedef struct TableAmRoutine
Snapshot snapshot, Snapshot snapshot,
TupleTableSlot *slot); TupleTableSlot *slot);
/*
* Is tid valid for a scan of this relation.
*/
bool (*tuple_tid_valid) (TableScanDesc scan,
ItemPointer tid);
/* /*
* Return the latest version of the tuple at `tid`, by updating `tid` to * Return the latest version of the tuple at `tid`, by updating `tid` to
* point at the newest version. * point at the newest version.
*/ */
void (*tuple_get_latest_tid) (Relation rel, void (*tuple_get_latest_tid) (TableScanDesc scan,
Snapshot snapshot,
ItemPointer tid); ItemPointer tid);
/* /*
...@@ -548,10 +553,10 @@ typedef struct TableAmRoutine ...@@ -548,10 +553,10 @@ typedef struct TableAmRoutine
/* /*
* See table_relation_size(). * See table_relation_size().
* *
* Note that currently a few callers use the MAIN_FORKNUM size to vet the * Note that currently a few callers use the MAIN_FORKNUM size to figure
* validity of tids (e.g. nodeTidscans.c), and others use it to figure out * out the range of potentially interesting blocks (brin, analyze). It's
* the range of potentially interesting blocks (brin, analyze). The * probable that we'll need to revise the interface for those at some
* abstraction around this will need to be improved in the near future. * point.
*/ */
uint64 (*relation_size) (Relation rel, ForkNumber forkNumber); uint64 (*relation_size) (Relation rel, ForkNumber forkNumber);
...@@ -986,15 +991,25 @@ table_fetch_row_version(Relation rel, ...@@ -986,15 +991,25 @@ table_fetch_row_version(Relation rel,
} }
/* /*
* Return the latest version of the tuple at `tid`, by updating `tid` to * Verify that `tid` is a potentially valid tuple identifier. That doesn't
* point at the newest version. * mean that the pointed to row needs to exist or be visible, but that
* attempting to fetch the row (e.g. with table_get_latest_tid() or
* table_fetch_row_version()) should not error out if called with that tid.
*
* `scan` needs to have been started via table_beginscan().
*/ */
static inline void static inline bool
table_get_latest_tid(Relation rel, Snapshot snapshot, ItemPointer tid) table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
{ {
rel->rd_tableam->tuple_get_latest_tid(rel, snapshot, tid); return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid);
} }
/*
* Return the latest version of the tuple at `tid`, by updating `tid` to
* point at the newest version.
*/
extern void table_get_latest_tid(TableScanDesc scan, ItemPointer tid);
/* /*
* Return true iff tuple in slot satisfies the snapshot. * Return true iff tuple in slot satisfies the snapshot.
* *
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment