From 147e3722f7e531f15ba389a4d518efe8cd0bd736 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 17 May 2019 18:52:01 -0700 Subject: [PATCH] tableam: Avoid relying on relation size to determine validity of tids. Instead add a tableam callback to do so. To avoid adding per validation overhead, pass a scan to tuple_tid_valid. In heap's case we'd otherwise incurred a RelationGetNumberOfBlocks() call for each tid - which'd have added noticable overhead to nodeTidscan.c. Author: Andres Freund Reviewed-By: Ashwin Agrawal Discussion: https://postgr.es/m/20190515185447.gno2jtqxyktylyvs@alap3.anarazel.de --- src/backend/access/heap/heapam.c | 26 +++------ src/backend/access/heap/heapam_handler.c | 10 ++++ src/backend/access/table/tableam.c | 27 +++++++++ src/backend/executor/nodeTidscan.c | 73 ++++++++++++++++-------- src/backend/utils/adt/tid.c | 10 +++- src/include/access/heapam.h | 3 +- src/include/access/tableam.h | 37 ++++++++---- 7 files changed, 129 insertions(+), 57 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ec9853603f..d8d4f3b1f5 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1654,8 +1654,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, /* * heap_get_latest_tid - get the latest tid of a specified tuple * - * Actually, this gets the latest version that is visible according to - * the passed snapshot. You can pass SnapshotDirty to get the very latest, + * Actually, this gets the latest version that is visible according to the + * scan's snapshot. Create a scan using SnapshotDirty to get the very latest, * possibly uncommitted version. * * *tid is both an input and an output parameter: it is updated to @@ -1663,28 +1663,20 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * if no version of the row passes the snapshot test. */ void -heap_get_latest_tid(Relation relation, - Snapshot snapshot, +heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid) { - BlockNumber blk; + Relation relation = sscan->rs_rd; + Snapshot snapshot = sscan->rs_snapshot; ItemPointerData ctid; TransactionId priorXmax; - /* this is to avoid Assert failures on bad input */ - if (!ItemPointerIsValid(tid)) - return; - /* - * Since this can be called with user-supplied TID, don't trust the input - * too much. (RelationGetNumberOfBlocks is an expensive check, so we - * don't check t_ctid links again this way. Note that it would not do to - * call it just once and save the result, either.) + * table_get_latest_tid verified that the passed in tid is valid. Assume + * that t_ctid links are valid however - there shouldn't be invalid ones + * in the table. */ - blk = ItemPointerGetBlockNumber(tid); - if (blk >= RelationGetNumberOfBlocks(relation)) - elog(ERROR, "block number %u is out of range for relation \"%s\"", - blk, RelationGetRelationName(relation)); + Assert(ItemPointerIsValid(tid)); /* * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 9aa468295a..35553c7c92 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -204,6 +204,15 @@ heapam_fetch_row_version(Relation relation, return false; } +static bool +heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) +{ + HeapScanDesc hscan = (HeapScanDesc) scan; + + return ItemPointerIsValid(tid) && + ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks; +} + static bool heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, Snapshot snapshot) @@ -2568,6 +2577,7 @@ static const TableAmRoutine heapam_methods = { .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, + .tuple_tid_valid = heapam_tuple_tid_valid, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index baba1ea699..6e46befdfd 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -213,6 +213,33 @@ table_index_fetch_tuple_check(Relation rel, } +/* ------------------------------------------------------------------------ + * Functions for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ + */ + +void +table_get_latest_tid(TableScanDesc scan, ItemPointer tid) +{ + Relation rel = scan->rs_rd; + const TableAmRoutine *tableam = rel->rd_tableam; + + /* + * Since this can be called with user-supplied TID, don't trust the input + * too much. + */ + if (!tableam->tuple_tid_valid(scan, tid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("tid (%u, %u) is not valid for relation for relation \"%s\"", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid), + RelationGetRelationName(rel)))); + + return tableam->tuple_get_latest_tid(scan, tid); +} + + /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. * ---------------------------------------------------------------------------- diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 156be56a57..93335864a1 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -129,19 +129,23 @@ static void TidListEval(TidScanState *tidstate) { ExprContext *econtext = tidstate->ss.ps.ps_ExprContext; - BlockNumber nblocks; + TableScanDesc scan; ItemPointerData *tidList; int numAllocTids; int numTids; ListCell *l; /* - * We silently discard any TIDs that are out of range at the time of scan - * start. (Since we hold at least AccessShareLock on the table, it won't - * be possible for someone to truncate away the blocks we intend to - * visit.) + * Start scan on-demand - initializing a scan isn't free (e.g. heap stats + * the size of the table), so it makes sense to delay that until needed - + * the node might never get executed. */ - nblocks = RelationGetNumberOfBlocks(tidstate->ss.ss_currentRelation); + if (tidstate->ss.ss_currentScanDesc == NULL) + tidstate->ss.ss_currentScanDesc = + table_beginscan(tidstate->ss.ss_currentRelation, + tidstate->ss.ps.state->es_snapshot, + 0, NULL); + scan = tidstate->ss.ss_currentScanDesc; /* * We initialize the array with enough slots for the case that all quals @@ -165,19 +169,27 @@ TidListEval(TidScanState *tidstate) DatumGetPointer(ExecEvalExprSwitchContext(tidexpr->exprstate, econtext, &isNull)); - if (!isNull && - ItemPointerIsValid(itemptr) && - ItemPointerGetBlockNumber(itemptr) < nblocks) + if (isNull) + continue; + + /* + * We silently discard any TIDs that the AM considers invalid + * (E.g. for heap, they could be out of range at the time of scan + * start. Since we hold at least AccessShareLock on the table, it + * won't be possible for someone to truncate away the blocks we + * intend to visit.). + */ + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + if (numTids >= numAllocTids) { - if (numTids >= numAllocTids) - { - numAllocTids *= 2; - tidList = (ItemPointerData *) - repalloc(tidList, - numAllocTids * sizeof(ItemPointerData)); - } - tidList[numTids++] = *itemptr; + numAllocTids *= 2; + tidList = (ItemPointerData *) + repalloc(tidList, + numAllocTids * sizeof(ItemPointerData)); } + tidList[numTids++] = *itemptr; } else if (tidexpr->exprstate && tidexpr->isarray) { @@ -206,13 +218,15 @@ TidListEval(TidScanState *tidstate) } for (i = 0; i < ndatums; i++) { - if (!ipnulls[i]) - { - itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); - if (ItemPointerIsValid(itemptr) && - ItemPointerGetBlockNumber(itemptr) < nblocks) - tidList[numTids++] = *itemptr; - } + if (ipnulls[i]) + continue; + + itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); + + if (!table_tuple_tid_valid(scan, itemptr)) + continue; + + tidList[numTids++] = *itemptr; } pfree(ipdatums); pfree(ipnulls); @@ -306,6 +320,7 @@ TidNext(TidScanState *node) EState *estate; ScanDirection direction; Snapshot snapshot; + TableScanDesc scan; Relation heapRelation; TupleTableSlot *slot; ItemPointerData *tidList; @@ -327,6 +342,7 @@ TidNext(TidScanState *node) if (node->tss_TidList == NULL) TidListEval(node); + scan = node->ss.ss_currentScanDesc; tidList = node->tss_TidList; numTids = node->tss_NumTids; @@ -365,7 +381,7 @@ TidNext(TidScanState *node) * current according to our snapshot. */ if (node->tss_isCurrentOf) - table_get_latest_tid(heapRelation, snapshot, &tid); + table_get_latest_tid(scan, &tid); if (table_fetch_row_version(heapRelation, &tid, snapshot, slot)) return slot; @@ -442,6 +458,10 @@ ExecReScanTidScan(TidScanState *node) node->tss_NumTids = 0; node->tss_TidPtr = -1; + /* not really necessary, but seems good form */ + if (node->ss.ss_currentScanDesc) + table_rescan(node->ss.ss_currentScanDesc, NULL); + ExecScanReScan(&node->ss); } @@ -455,6 +475,9 @@ ExecReScanTidScan(TidScanState *node) void ExecEndTidScan(TidScanState *node) { + if (node->ss.ss_currentScanDesc) + table_endscan(node->ss.ss_currentScanDesc); + /* * Free the exprcontext */ diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 6ab26d8ea8..1aab30b6aa 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -358,6 +358,7 @@ currtid_byreloid(PG_FUNCTION_ARGS) Relation rel; AclResult aclresult; Snapshot snapshot; + TableScanDesc scan; result = (ItemPointer) palloc(sizeof(ItemPointerData)); if (!reloid) @@ -380,7 +381,9 @@ currtid_byreloid(PG_FUNCTION_ARGS) ItemPointerCopy(tid, result); snapshot = RegisterSnapshot(GetLatestSnapshot()); - table_get_latest_tid(rel, snapshot, result); + scan = table_beginscan(rel, snapshot, 0, NULL); + table_get_latest_tid(scan, result); + table_endscan(scan); UnregisterSnapshot(snapshot); table_close(rel, AccessShareLock); @@ -398,6 +401,7 @@ currtid_byrelname(PG_FUNCTION_ARGS) Relation rel; AclResult aclresult; Snapshot snapshot; + TableScanDesc scan; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = table_openrv(relrv, AccessShareLock); @@ -415,7 +419,9 @@ currtid_byrelname(PG_FUNCTION_ARGS) ItemPointerCopy(tid, result); snapshot = RegisterSnapshot(GetLatestSnapshot()); - table_get_latest_tid(rel, snapshot, result); + scan = table_beginscan(rel, snapshot, 0, NULL); + table_get_latest_tid(scan, result); + table_endscan(scan); UnregisterSnapshot(snapshot); table_close(rel, AccessShareLock); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 77e5e603b0..6b8c7020c8 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -134,8 +134,7 @@ extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call); -extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, - ItemPointer tid); +extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern BulkInsertState GetBulkInsertState(void); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c5d6460203..8fbeb02033 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -308,12 +308,17 @@ typedef struct TableAmRoutine Snapshot snapshot, TupleTableSlot *slot); + /* + * Is tid valid for a scan of this relation. + */ + bool (*tuple_tid_valid) (TableScanDesc scan, + ItemPointer tid); + /* * Return the latest version of the tuple at `tid`, by updating `tid` to * point at the newest version. */ - void (*tuple_get_latest_tid) (Relation rel, - Snapshot snapshot, + void (*tuple_get_latest_tid) (TableScanDesc scan, ItemPointer tid); /* @@ -548,10 +553,10 @@ typedef struct TableAmRoutine /* * See table_relation_size(). * - * Note that currently a few callers use the MAIN_FORKNUM size to vet the - * validity of tids (e.g. nodeTidscans.c), and others use it to figure out - * the range of potentially interesting blocks (brin, analyze). The - * abstraction around this will need to be improved in the near future. + * Note that currently a few callers use the MAIN_FORKNUM size to figure + * out the range of potentially interesting blocks (brin, analyze). It's + * probable that we'll need to revise the interface for those at some + * point. */ uint64 (*relation_size) (Relation rel, ForkNumber forkNumber); @@ -985,15 +990,25 @@ table_fetch_row_version(Relation rel, return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); } +/* + * Verify that `tid` is a potentially valid tuple identifier. That doesn't + * mean that the pointed to row needs to exist or be visible, but that + * attempting to fetch the row (e.g. with table_get_latest_tid() or + * table_fetch_row_version()) should not error out if called with that tid. + * + * `scan` needs to have been started via table_beginscan(). + */ +static inline bool +table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid) +{ + return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid); +} + /* * Return the latest version of the tuple at `tid`, by updating `tid` to * point at the newest version. */ -static inline void -table_get_latest_tid(Relation rel, Snapshot snapshot, ItemPointer tid) -{ - rel->rd_tableam->tuple_get_latest_tid(rel, snapshot, tid); -} +extern void table_get_latest_tid(TableScanDesc scan, ItemPointer tid); /* * Return true iff tuple in slot satisfies the snapshot.