Don't acquire a predicate lock for analyze scans; refactor scan option passing.

Before this commit, when ANALYZE was run on a table under the serializable
isolation level (either via an explicit BEGIN TRANSACTION ISOLATION LEVEL
SERIALIZABLE, or because default_transaction_isolation was set to
serializable), a null pointer dereference led to a crash.

The analyze scan doesn't need a snapshot (nor predicate locking), but
before this commit the scan setup only carried information about whether a
scan was a bitmap or a sample scan, so there was no way to recognize an
analyze scan and skip the predicate lock.

Refactor the option passing to the scan_begin callback to use a bitmask
instead. Alternatively we could have added another boolean parameter, but
that seems harder to read. Even before this issue, various people (Heikki,
Tom, Robert) had suggested switching to a bitmask.
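
To illustrate the shape of the new interface, here is a minimal standalone
sketch of the bitmask pattern (not in-tree code; the enum values are copied
from the ScanOptions definition added to tableam.h in this commit):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Trimmed copy of the ScanOptions bits added in tableam.h. */
    typedef enum ScanOptions
    {
        SO_TYPE_SEQSCAN = 1 << 0,
        SO_TYPE_BITMAPSCAN = 1 << 1,
        SO_TYPE_SAMPLESCAN = 1 << 2,
        SO_TYPE_ANALYZE = 1 << 3,
        SO_ALLOW_STRAT = 1 << 4,
        SO_ALLOW_SYNC = 1 << 5,
        SO_ALLOW_PAGEMODE = 1 << 6,
        SO_TEMP_SNAPSHOT = 1 << 7
    } ScanOptions;

    int
    main(void)
    {
        /* A plain seqscan: one SO_TYPE_* bit plus the desired SO_ALLOW_* bits. */
        uint32_t    flags = SO_TYPE_SEQSCAN |
            SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;

        /* Testing an option is a simple mask ... */
        assert((flags & SO_ALLOW_SYNC) != 0);

        /* ... and so is clearing one, e.g. when syncscans must be disabled. */
        flags &= ~SO_ALLOW_SYNC;
        assert((flags & SO_ALLOW_SYNC) == 0);

        printf("flags = 0x%02x\n", (unsigned) flags);
        return 0;
    }

A single uint32 replaces six boolean parameters, and each call site spells
out exactly which options it wants.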

These changes don't alter the scan APIs outside of tableam. The flags
argument could be exposed further, but that isn't necessary to fix this
problem, and the wrapper table_beginscan* functions encapsulate most of
that complexity anyway.
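
For example, after this commit the table_beginscan_strat() wrapper (as it
appears in the tableam.h hunk below) keeps its existing boolean parameters
and translates them into the bitmask internally, so its callers don't need
to know about ScanOptions at all:

    static inline TableScanDesc
    table_beginscan_strat(Relation rel, Snapshot snapshot,
                          int nkeys, struct ScanKeyData *key,
                          bool allow_strat, bool allow_sync)
    {
        uint32      flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;

        if (allow_strat)
            flags |= SO_ALLOW_STRAT;
        if (allow_sync)
            flags |= SO_ALLOW_SYNC;

        return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
    }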

After these changes, fixing the bug is trivial: just don't acquire a
predicate lock for analyze-style scans. That was already the case for
bitmap heap scans. Also add an assert that a snapshot is passed when the
predicate lock is acquired, so this kind of bug gets noticed even without
running under serializable.
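
Concretely, the predicate-lock acquisition in heap_beginscan() now reads as
follows (excerpted from the heapam.c hunk below). Neither SO_TYPE_ANALYZE
nor SO_TYPE_BITMAPSCAN is in the mask, so analyze scans (like bitmap scans
before them) skip the relation-level lock, and the Assert catches a missing
snapshot even when predicate locking itself is a no-op outside serializable:

    if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
    {
        /*
         * Ensure a missing snapshot is noticed reliably, even if the
         * isolation mode means predicate locking isn't performed (and
         * therefore the snapshot isn't used here).
         */
        Assert(snapshot);
        PredicateLockRelation(relation, snapshot);
    }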

Also add a comment noting that sample scans currently require
predicate-locking the entire relation; previously that wasn't remarked
upon.

Reported-By: Joe Wildish
Author: Andres Freund
Discussion:
    https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx
    https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de
    https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
Commit c3b23ae457 (parent bd1592e857)
Committed by Andres Freund, 2019-05-19 15:10:28 -07:00
8 changed files with 160 additions and 102 deletions

src/backend/access/heap/heapam.c

@ -245,8 +245,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
scan->rs_nblocks > NBuffers / 4)
{
allow_strat = scan->rs_base.rs_allow_strat;
allow_sync = scan->rs_base.rs_allow_sync;
allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
}
else
allow_strat = allow_sync = false;
@ -267,7 +267,10 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
if (scan->rs_base.rs_parallel != NULL)
{
/* For parallel scan, believe whatever ParallelTableScanDesc says. */
scan->rs_base.rs_syncscan = scan->rs_base.rs_parallel->phs_syncscan;
if (scan->rs_base.rs_parallel->phs_syncscan)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
}
else if (keep_startblock)
{
@ -276,16 +279,19 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
* so that rewinding a cursor doesn't generate surprising results.
* Reset the active syncscan setting, though.
*/
scan->rs_base.rs_syncscan = (allow_sync && synchronize_seqscans);
if (allow_sync && synchronize_seqscans)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
}
else if (allow_sync && synchronize_seqscans)
{
scan->rs_base.rs_syncscan = true;
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
}
else
{
scan->rs_base.rs_syncscan = false;
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
scan->rs_startblock = 0;
}
@ -305,11 +311,11 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
/*
* Currently, we don't have a stats counter for bitmap heap scans (but the
* underlying bitmap index scans will be counted) or sample scans (we only
* update stats for tuple fetches there)
* Currently, we only have a stats counter for sequential heap scans (but
* e.g for bitmap scans the underlying bitmap index scans will be counted,
* and for sample scans we update stats for tuple fetches).
*/
if (!scan->rs_base.rs_bitmapscan && !scan->rs_base.rs_samplescan)
if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
pgstat_count_heap_scan(scan->rs_base.rs_rd);
}
@ -325,7 +331,8 @@ heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlk
HeapScanDesc scan = (HeapScanDesc) sscan;
Assert(!scan->rs_inited); /* else too late to change */
Assert(!scan->rs_base.rs_syncscan); /* else rs_startblock is significant */
/* else rs_startblock is significant */
Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
/* Check startBlk is valid (but allow case of zero blocks...) */
Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
@ -375,7 +382,7 @@ heapgetpage(TableScanDesc sscan, BlockNumber page)
RBM_NORMAL, scan->rs_strategy);
scan->rs_cblock = page;
if (!scan->rs_base.rs_pageatatime)
if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
return;
buffer = scan->rs_cbuf;
@ -574,7 +581,7 @@ heapgettup(HeapScanDesc scan,
* time, and much more likely that we'll just bollix things for
* forward scanners.
*/
scan->rs_base.rs_syncscan = false;
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
/* start from last page of the scan */
if (scan->rs_startblock > 0)
page = scan->rs_startblock - 1;
@ -738,7 +745,7 @@ heapgettup(HeapScanDesc scan,
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_base.rs_syncscan)
if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
ss_report_location(scan->rs_base.rs_rd, page);
}
@ -885,7 +892,7 @@ heapgettup_pagemode(HeapScanDesc scan,
* time, and much more likely that we'll just bollix things for
* forward scanners.
*/
scan->rs_base.rs_syncscan = false;
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
/* start from last page of the scan */
if (scan->rs_startblock > 0)
page = scan->rs_startblock - 1;
@ -1037,7 +1044,7 @@ heapgettup_pagemode(HeapScanDesc scan,
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_base.rs_syncscan)
if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
ss_report_location(scan->rs_base.rs_rd, page);
}
@ -1125,12 +1132,7 @@ TableScanDesc
heap_beginscan(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key,
ParallelTableScanDesc parallel_scan,
bool allow_strat,
bool allow_sync,
bool allow_pagemode,
bool is_bitmapscan,
bool is_samplescan,
bool temp_snap)
uint32 flags)
{
HeapScanDesc scan;
@ -1151,33 +1153,39 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_base.rs_rd = relation;
scan->rs_base.rs_snapshot = snapshot;
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_bitmapscan = is_bitmapscan;
scan->rs_base.rs_samplescan = is_samplescan;
scan->rs_strategy = NULL; /* set in initscan */
scan->rs_base.rs_allow_strat = allow_strat;
scan->rs_base.rs_allow_sync = allow_sync;
scan->rs_base.rs_temp_snap = temp_snap;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
scan->rs_strategy = NULL; /* set in initscan */
/*
* we can use page-at-a-time mode if it's an MVCC-safe snapshot
* Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
*/
scan->rs_base.rs_pageatatime =
allow_pagemode && snapshot && IsMVCCSnapshot(snapshot);
if (!(snapshot && IsMVCCSnapshot(snapshot)))
scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
/*
* For a seqscan in a serializable transaction, acquire a predicate lock
* on the entire relation. This is required not only to lock all the
* matching tuples, but also to conflict with new insertions into the
* table. In an indexscan, we take page locks on the index pages covering
* the range specified in the scan qual, but in a heap scan there is
* nothing more fine-grained to lock. A bitmap scan is a different story,
* there we have already scanned the index and locked the index pages
* covering the predicate. But in that case we still have to lock any
* matching heap tuples.
* For seqscan and sample scans in a serializable transaction, acquire a
* predicate lock on the entire relation. This is required not only to
* lock all the matching tuples, but also to conflict with new insertions
* into the table. In an indexscan, we take page locks on the index pages
* covering the range specified in the scan qual, but in a heap scan there
* is nothing more fine-grained to lock. A bitmap scan is a different
* story, there we have already scanned the index and locked the index
* pages covering the predicate. But in that case we still have to lock
* any matching heap tuples. For sample scan we could optimize the locking
* to be at least page-level granularity, but we'd need to add per-tuple
* locking for that.
*/
if (!is_bitmapscan)
if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
{
/*
* Ensure a missing snapshot is noticed reliably, even if the
* isolation mode means predicate locking isn't performed (and
* therefore the snapshot isn't used here).
*/
Assert(snapshot);
PredicateLockRelation(relation, snapshot);
}
/* we only need to set this up once */
scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
@ -1204,10 +1212,21 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
if (set_params)
{
scan->rs_base.rs_allow_strat = allow_strat;
scan->rs_base.rs_allow_sync = allow_sync;
scan->rs_base.rs_pageatatime =
allow_pagemode && IsMVCCSnapshot(scan->rs_base.rs_snapshot);
if (allow_strat)
scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
if (allow_sync)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
if (allow_pagemode && scan->rs_base.rs_snapshot &&
IsMVCCSnapshot(scan->rs_base.rs_snapshot))
scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
}
/*
@ -1246,7 +1265,7 @@ heap_endscan(TableScanDesc sscan)
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
if (scan->rs_base.rs_temp_snap)
if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
UnregisterSnapshot(scan->rs_base.rs_snapshot);
pfree(scan);
@ -1288,7 +1307,7 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction)
HEAPDEBUG_1; /* heap_getnext( info ) */
if (scan->rs_base.rs_pageatatime)
if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
heapgettup_pagemode(scan, direction,
scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
else
@ -1335,11 +1354,10 @@ heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *s
HEAPAMSLOTDEBUG_1; /* heap_getnextslot( info ) */
if (scan->rs_base.rs_pageatatime)
heapgettup_pagemode(scan, direction,
scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
else
heapgettup(scan, direction, scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
if (scan->rs_ctup.t_data == NULL)
{

src/backend/access/heap/heapam_handler.c

@ -2323,7 +2323,7 @@ heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_syncscan)
if (scan->rs_flags & SO_ALLOW_SYNC)
ss_report_location(scan->rs_rd, blockno);
if (blockno == hscan->rs_startblock)
@ -2357,7 +2357,7 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
HeapScanDesc hscan = (HeapScanDesc) scan;
TsmRoutine *tsm = scanstate->tsmroutine;
BlockNumber blockno = hscan->rs_cblock;
bool pagemode = scan->rs_pageatatime;
bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
Page page;
bool all_visible;
@ -2504,7 +2504,7 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
{
HeapScanDesc hscan = (HeapScanDesc) scan;
if (scan->rs_pageatatime)
if (scan->rs_flags & SO_ALLOW_PAGEMODE)
{
/*
* In pageatatime mode, heapgetpage() already did visibility checks,

src/backend/access/table/tableam.c

@ -93,12 +93,13 @@ table_slot_create(Relation relation, List **reglist)
TableScanDesc
table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
{
uint32 flags = SO_TYPE_SEQSCAN |
SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
Oid relid = RelationGetRelid(relation);
Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
NULL, true, true, true, false,
false, true);
NULL, flags);
}
void
@ -108,7 +109,7 @@ table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
RegisterSnapshot(snapshot);
scan->rs_snapshot = snapshot;
scan->rs_temp_snap = true;
scan->rs_flags |= SO_TEMP_SNAPSHOT;
}
@ -156,6 +157,8 @@ TableScanDesc
table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
{
Snapshot snapshot;
uint32 flags = SO_TYPE_SEQSCAN |
SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
@ -165,6 +168,7 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
snapshot = RestoreSnapshot((char *) parallel_scan +
parallel_scan->phs_snapshot_off);
RegisterSnapshot(snapshot);
flags |= SO_TEMP_SNAPSHOT;
}
else
{
@ -173,9 +177,7 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
}
return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
parallel_scan, true, true, true,
false, false,
!parallel_scan->phs_snapshot_any);
parallel_scan, flags);
}

src/include/access/heapam.h

@ -110,12 +110,7 @@ typedef enum
extern TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key,
ParallelTableScanDesc parallel_scan,
bool allow_strat,
bool allow_sync,
bool allow_pagemode,
bool is_bitmapscan,
bool is_samplescan,
bool temp_snap);
uint32 flags);
extern void heap_setscanlimits(TableScanDesc scan, BlockNumber startBlk,
BlockNumber endBlk);
extern void heapgetpage(TableScanDesc scan, BlockNumber page);

src/include/access/relscan.h

@ -35,13 +35,12 @@ typedef struct TableScanDescData
struct SnapshotData *rs_snapshot; /* snapshot to see */
int rs_nkeys; /* number of scan keys */
struct ScanKeyData *rs_key; /* array of scan key descriptors */
bool rs_bitmapscan; /* true if this is really a bitmap scan */
bool rs_samplescan; /* true if this is really a sample scan */
bool rs_pageatatime; /* verify visibility page-at-a-time? */
bool rs_allow_strat; /* allow or disallow use of access strategy */
bool rs_allow_sync; /* allow or disallow use of syncscan */
bool rs_temp_snap; /* unregister snapshot at scan end? */
bool rs_syncscan; /* report location to syncscan logic? */
/*
* Information about type and behaviour of the scan, a bitmask of members
* of the ScanOptions enum (see tableam.h).
*/
uint32 rs_flags;
struct ParallelTableScanDescData *rs_parallel; /* parallel scan
* information */

src/include/access/tableam.h

@ -39,6 +39,28 @@ struct TBMIterateResult;
struct VacuumParams;
struct ValidateIndexState;
/*
* Bitmask values for the flags argument to the scan_begin callback.
*/
typedef enum ScanOptions
{
/* one of SO_TYPE_* may be specified */
SO_TYPE_SEQSCAN = 1 << 0,
SO_TYPE_BITMAPSCAN = 1 << 1,
SO_TYPE_SAMPLESCAN = 1 << 2,
SO_TYPE_ANALYZE = 1 << 3,
/* several of SO_ALLOW_* may be specified */
/* allow or disallow use of access strategy */
SO_ALLOW_STRAT = 1 << 4,
/* report location to syncscan logic? */
SO_ALLOW_SYNC = 1 << 5,
/* verify visibility page-at-a-time? */
SO_ALLOW_PAGEMODE = 1 << 6,
/* unregister snapshot at scan end? */
SO_TEMP_SNAPSHOT = 1 << 7
} ScanOptions;
/*
* Result codes for table_{update,delete,lock_tuple}, and for visibility
@ -78,7 +100,6 @@ typedef enum TM_Result
TM_WouldBlock
} TM_Result;
/*
* When table_update, table_delete, or table_lock_tuple fail because the target
* tuple is already outdated, they fill in this struct to provide information
@ -170,26 +191,17 @@ typedef struct TableAmRoutine
* parallelscan_initialize(), and has to be for the same relation. Will
* only be set coming from table_beginscan_parallel().
*
* allow_{strat, sync, pagemode} specify whether a scan strategy,
* synchronized scans, or page mode may be used (although not every AM
* will support those).
*
* is_{bitmapscan, samplescan} specify whether the scan is intended to
* support those types of scans.
*
* if temp_snap is true, the snapshot will need to be deallocated at
* scan_end.
* `flags` is a bitmask indicating the type of scan (ScanOptions's
* SO_TYPE_*, currently only one may be specified), options controlling
* the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
* specified, an AM may ignore unsupported ones) and whether the snapshot
* needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
*/
TableScanDesc (*scan_begin) (Relation rel,
Snapshot snapshot,
int nkeys, struct ScanKeyData *key,
ParallelTableScanDesc pscan,
bool allow_strat,
bool allow_sync,
bool allow_pagemode,
bool is_bitmapscan,
bool is_samplescan,
bool temp_snap);
uint32 flags);
/*
* Release resources and deallocate scan. If TableScanDesc.temp_snap,
@ -715,8 +727,10 @@ static inline TableScanDesc
table_beginscan(Relation rel, Snapshot snapshot,
int nkeys, struct ScanKeyData *key)
{
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
true, true, true, false, false, false);
uint32 flags = SO_TYPE_SEQSCAN |
SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@ -738,9 +752,14 @@ table_beginscan_strat(Relation rel, Snapshot snapshot,
int nkeys, struct ScanKeyData *key,
bool allow_strat, bool allow_sync)
{
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
allow_strat, allow_sync, true,
false, false, false);
uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;
if (allow_strat)
flags |= SO_ALLOW_STRAT;
if (allow_sync)
flags |= SO_ALLOW_SYNC;
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@ -753,8 +772,9 @@ static inline TableScanDesc
table_beginscan_bm(Relation rel, Snapshot snapshot,
int nkeys, struct ScanKeyData *key)
{
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
false, false, true, true, false, false);
uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE;
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@ -770,9 +790,16 @@ table_beginscan_sampling(Relation rel, Snapshot snapshot,
bool allow_strat, bool allow_sync,
bool allow_pagemode)
{
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL,
allow_strat, allow_sync, allow_pagemode,
false, true, false);
uint32 flags = SO_TYPE_SAMPLESCAN;
if (allow_strat)
flags |= SO_ALLOW_STRAT;
if (allow_sync)
flags |= SO_ALLOW_SYNC;
if (allow_pagemode)
flags |= SO_ALLOW_PAGEMODE;
return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
}
/*
@ -783,9 +810,9 @@ table_beginscan_sampling(Relation rel, Snapshot snapshot,
static inline TableScanDesc
table_beginscan_analyze(Relation rel)
{
return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL,
true, false, true,
false, true, false);
uint32 flags = SO_TYPE_ANALYZE;
return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags);
}
/*

src/test/regress/expected/vacuum.out

@ -163,6 +163,14 @@ ERROR: relation "does_not_exist" does not exist
VACUUM (SKIP_LOCKED) vactst;
VACUUM (SKIP_LOCKED, FULL) vactst;
ANALYZE (SKIP_LOCKED) vactst;
-- ensure VACUUM and ANALYZE don't have a problem with serializable
SET default_transaction_isolation = serializable;
VACUUM vactst;
ANALYZE vactst;
RESET default_transaction_isolation;
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;
ANALYZE vactst;
COMMIT;
DROP TABLE vaccluster;
DROP TABLE vactst;
DROP TABLE vacparted;

src/test/regress/sql/vacuum.sql

@ -124,6 +124,15 @@ VACUUM (SKIP_LOCKED) vactst;
VACUUM (SKIP_LOCKED, FULL) vactst;
ANALYZE (SKIP_LOCKED) vactst;
-- ensure VACUUM and ANALYZE don't have a problem with serializable
SET default_transaction_isolation = serializable;
VACUUM vactst;
ANALYZE vactst;
RESET default_transaction_isolation;
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE;
ANALYZE vactst;
COMMIT;
DROP TABLE vaccluster;
DROP TABLE vactst;
DROP TABLE vacparted;