diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 39eab1e28d..33dac14edc 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -921,6 +921,173 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, pfree(isnull); } +static bool +heapam_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + /* + * We must maintain a pin on the target page's buffer to ensure that + * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from + * under us. Hence, pin the page until we are done looking at it. We + * also choose to hold sharelock on the buffer throughout --- we could + * release and re-acquire sharelock for each tuple, but since we aren't + * doing much work per tuple, the extra lock traffic is probably better + * avoided. + */ + scan->rs_cblock = blockno; + scan->rs_cindex = FirstOffsetNumber; + scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, + blockno, RBM_NORMAL, bstrategy); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + /* in heap all blocks can contain tuples, so always return true */ + return true; +} + +static bool +heapam_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + Page targpage; + OffsetNumber maxoffset; + BufferHeapTupleTableSlot *hslot; + + Assert(TTS_IS_BUFFERTUPLE(slot)); + + hslot = (BufferHeapTupleTableSlot *) slot; + targpage = BufferGetPage(scan->rs_cbuf); + maxoffset = PageGetMaxOffsetNumber(targpage); + + /* Inner loop over all tuples on the selected page */ + for (; scan->rs_cindex <= maxoffset; scan->rs_cindex++) + { + ItemId itemid; + HeapTuple targtuple = &hslot->base.tupdata; + bool sample_it = false; + + itemid = PageGetItemId(targpage, scan->rs_cindex); + + /* + * We ignore unused and redirect line pointers. DEAD line pointers + * should be counted as dead, because we need vacuum to run to get rid + * of them. Note that this rule agrees with the way that + * heap_page_prune() counts things. + */ + if (!ItemIdIsNormal(itemid)) + { + if (ItemIdIsDead(itemid)) + *deadrows += 1; + continue; + } + + ItemPointerSet(&targtuple->t_self, scan->rs_cblock, scan->rs_cindex); + + targtuple->t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); + targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); + targtuple->t_len = ItemIdGetLength(itemid); + + switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, scan->rs_cbuf)) + { + case HEAPTUPLE_LIVE: + sample_it = true; + *liverows += 1; + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + /* Count dead and recently-dead rows */ + *deadrows += 1; + break; + + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Insert-in-progress rows are not counted. We assume that + * when the inserting transaction commits or aborts, it will + * send a stats message to increment the proper count. This + * works right only if that transaction ends after we finish + * analyzing the table; if things happen in the other order, + * its stats update will be overwritten by ours. However, the + * error will be large only if the other transaction runs long + * enough to insert many tuples, so assuming it will finish + * after us is the safer option. + * + * A special case is that the inserting transaction might be + * our own. 
In this case we should count and sample the row, + * to accommodate users who load a table and analyze it in one + * transaction. (pgstat_report_analyze has to adjust the + * numbers we send to the stats collector to make this come + * out right.) + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) + { + sample_it = true; + *liverows += 1; + } + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * We count and sample delete-in-progress rows the same as + * live ones, so that the stats counters come out right if the + * deleting transaction commits after us, per the same + * reasoning given above. + * + * If the delete was done by our own transaction, however, we + * must count the row as dead to make pgstat_report_analyze's + * stats adjustments come out right. (Note: this works out + * properly when the row was both inserted and deleted in our + * xact.) + * + * The net effect of these choices is that we act as though an + * IN_PROGRESS transaction hasn't happened yet, except if it + * is our own transaction, which we assume has happened. + * + * This approach ensures that we behave sanely if we see both + * the pre-image and post-image rows for a row being updated + * by a concurrent transaction: we will sample the pre-image + * but not the post-image. We also get sane results if the + * concurrent transaction never commits. + */ + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) + *deadrows += 1; + else + { + sample_it = true; + *liverows += 1; + } + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + if (sample_it) + { + ExecStoreBufferHeapTuple(targtuple, slot, scan->rs_cbuf); + scan->rs_cindex++; + + /* note that we leave the buffer locked here!
*/ + return true; + } + } + + /* Now release the lock and pin on the page */ + UnlockReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + + /* also prevent old slot contents from having pin on page */ + ExecClearTuple(slot); + + return false; +} + static double heapam_index_build_range_scan(Relation heapRelation, Relation indexRelation, @@ -1743,6 +1910,9 @@ static const TableAmRoutine heapam_methods = { .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, .relation_copy_data = heapam_relation_copy_data, .relation_copy_for_cluster = heapam_relation_copy_for_cluster, + .relation_vacuum = heap_vacuum_rel, + .scan_analyze_next_block = heapam_scan_analyze_next_block, + .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple, .index_build_range_scan = heapam_index_build_range_scan, .index_validate_scan = heapam_index_validate_scan, }; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 3465713d10..e0ec62c88c 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -17,11 +17,11 @@ #include <math.h> #include "access/genam.h" -#include "access/heapam.h" #include "access/multixact.h" #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -1014,6 +1014,8 @@ acquire_sample_rows(Relation onerel, int elevel, TransactionId OldestXmin; BlockSamplerData bs; ReservoirStateData rstate; + TupleTableSlot *slot; + TableScanDesc scan; Assert(targrows > 0); @@ -1027,193 +1029,68 @@ acquire_sample_rows(Relation onerel, int elevel, /* Prepare for sampling rows */ reservoir_init_selection_state(&rstate, targrows); + scan = table_beginscan_analyze(onerel); + slot = table_slot_create(onerel, NULL); + /* Outer loop over blocks to sample */ while (BlockSampler_HasMore(&bs)) { BlockNumber targblock = BlockSampler_Next(&bs); - Buffer targbuffer; - Page targpage; - OffsetNumber targoffset, - maxoffset; vacuum_delay_point(); - /* - * We must maintain a pin on the target page's buffer to ensure that - * the maxoffset value stays good (else concurrent VACUUM might delete - * tuples out from under us). Hence, pin the page until we are done - * looking at it. We also choose to hold sharelock on the buffer - * throughout --- we could release and re-acquire sharelock for each - * tuple, but since we aren't doing much work per tuple, the extra - * lock traffic is probably better avoided. - */ - targbuffer = ReadBufferExtended(onerel, MAIN_FORKNUM, targblock, - RBM_NORMAL, vac_strategy); - LockBuffer(targbuffer, BUFFER_LOCK_SHARE); - targpage = BufferGetPage(targbuffer); - maxoffset = PageGetMaxOffsetNumber(targpage); + if (!table_scan_analyze_next_block(scan, targblock, vac_strategy)) + continue; - /* Inner loop over all tuples on the selected page */ - for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++) + while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) { - ItemId itemid; - HeapTupleData targtuple; - bool sample_it = false; - - itemid = PageGetItemId(targpage, targoffset); - /* - * We ignore unused and redirect line pointers. DEAD line - * pointers should be counted as dead, because we need vacuum to - * run to get rid of them. Note that this rule agrees with the - * way that heap_page_prune() counts things. + * The first targrows sample rows are simply copied into the + * reservoir.
Then we start replacing tuples in the sample until + * we reach the end of the relation. This algorithm is from Jeff + * Vitter's paper (see full citation below). It works by + * repeatedly computing the number of tuples to skip before + * selecting a tuple, which replaces a randomly chosen element of + * the reservoir (current set of tuples). At all times the + * reservoir is a true random sample of the tuples we've passed + * over so far, so when we fall off the end of the relation we're + * done. */ - if (!ItemIdIsNormal(itemid)) - { - if (ItemIdIsDead(itemid)) - deadrows += 1; - continue; - } - - ItemPointerSet(&targtuple.t_self, targblock, targoffset); - - targtuple.t_tableOid = RelationGetRelid(onerel); - targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); - targtuple.t_len = ItemIdGetLength(itemid); - - switch (HeapTupleSatisfiesVacuum(&targtuple, - OldestXmin, - targbuffer)) - { - case HEAPTUPLE_LIVE: - sample_it = true; - liverows += 1; - break; - - case HEAPTUPLE_DEAD: - case HEAPTUPLE_RECENTLY_DEAD: - /* Count dead and recently-dead rows */ - deadrows += 1; - break; - - case HEAPTUPLE_INSERT_IN_PROGRESS: - - /* - * Insert-in-progress rows are not counted. We assume - * that when the inserting transaction commits or aborts, - * it will send a stats message to increment the proper - * count. This works right only if that transaction ends - * after we finish analyzing the table; if things happen - * in the other order, its stats update will be - * overwritten by ours. However, the error will be large - * only if the other transaction runs long enough to - * insert many tuples, so assuming it will finish after us - * is the safer option. - * - * A special case is that the inserting transaction might - * be our own. In this case we should count and sample - * the row, to accommodate users who load a table and - * analyze it in one transaction. (pgstat_report_analyze - * has to adjust the numbers we send to the stats - * collector to make this come out right.) - */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple.t_data))) - { - sample_it = true; - liverows += 1; - } - break; - - case HEAPTUPLE_DELETE_IN_PROGRESS: - - /* - * We count and sample delete-in-progress rows the same as - * live ones, so that the stats counters come out right if - * the deleting transaction commits after us, per the same - * reasoning given above. - * - * If the delete was done by our own transaction, however, - * we must count the row as dead to make - * pgstat_report_analyze's stats adjustments come out - * right. (Note: this works out properly when the row was - * both inserted and deleted in our xact.) - * - * The net effect of these choices is that we act as - * though an IN_PROGRESS transaction hasn't happened yet, - * except if it is our own transaction, which we assume - * has happened. - * - * This approach ensures that we behave sanely if we see - * both the pre-image and post-image rows for a row being - * updated by a concurrent transaction: we will sample the - * pre-image but not the post-image. We also get sane - * results if the concurrent transaction never commits. 
- */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple.t_data))) - deadrows += 1; - else - { - sample_it = true; - liverows += 1; - } - break; - - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - if (sample_it) + if (numrows < targrows) + rows[numrows++] = ExecCopySlotHeapTuple(slot); + else { /* - * The first targrows sample rows are simply copied into the - * reservoir. Then we start replacing tuples in the sample - * until we reach the end of the relation. This algorithm is - * from Jeff Vitter's paper (see full citation below). It - * works by repeatedly computing the number of tuples to skip - * before selecting a tuple, which replaces a randomly chosen - * element of the reservoir (current set of tuples). At all - * times the reservoir is a true random sample of the tuples - * we've passed over so far, so when we fall off the end of - * the relation we're done. + * t in Vitter's paper is the number of records already + * processed. If we need to compute a new S value, we must + * use the not-yet-incremented value of samplerows as t. */ - if (numrows < targrows) - rows[numrows++] = heap_copytuple(&targtuple); - else + if (rowstoskip < 0) + rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); + + if (rowstoskip <= 0) { /* - * t in Vitter's paper is the number of records already - * processed. If we need to compute a new S value, we - * must use the not-yet-incremented value of samplerows as - * t. + * Found a suitable tuple, so save it, replacing one old + * tuple at random */ - if (rowstoskip < 0) - rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); - if (rowstoskip <= 0) - { - /* - * Found a suitable tuple, so save it, replacing one - * old tuple at random - */ - int k = (int) (targrows * sampler_random_fract(rstate.randstate)); - - Assert(k >= 0 && k < targrows); - heap_freetuple(rows[k]); - rows[k] = heap_copytuple(&targtuple); - } - - rowstoskip -= 1; + Assert(k >= 0 && k < targrows); + heap_freetuple(rows[k]); + rows[k] = ExecCopySlotHeapTuple(slot); } - samplerows += 1; + rowstoskip -= 1; } - } - /* Now release the lock and pin on the page */ - UnlockReleaseBuffer(targbuffer); + samplerows += 1; + } } + ExecDropSingleTupleTableSlot(slot); + table_endscan(scan); + /* * If we didn't find as many tuples as we wanted then we're done. No sort * is needed, since they're already in order. diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 10df766f1c..fd2e47ffc4 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1758,7 +1758,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) cluster_rel(relid, InvalidOid, cluster_options); } else - heap_vacuum_rel(onerel, params, vac_strategy); + table_relation_vacuum(onerel, params, vac_strategy); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c571f8a899..1f0eb2fdb7 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -30,6 +30,7 @@ extern bool synchronize_seqscans; struct BulkInsertStateData; struct IndexInfo; struct IndexBuildCallback; +struct VacuumParams; struct ValidateIndexState; @@ -396,9 +397,9 @@ typedef struct TableAmRoutine /* * This callback needs to remove all contents from `rel`'s current - * relfilenode. 
No provisions for transactional behaviour need to be - * made. Often this can be implemented by truncating the underlying - * storage to its minimal size. + * relfilenode. No provisions for transactional behaviour need to be made. + * Often this can be implemented by truncating the underlying storage to + * its minimal size. * * See also table_relation_nontransactional_truncate(). */ @@ -418,6 +419,59 @@ typedef struct TableAmRoutine TransactionId OldestXmin, TransactionId FreezeXid, MultiXactId MultiXactCutoff, double *num_tuples, double *tups_vacuumed, double *tups_recently_dead); + /* + * React to a VACUUM command on the relation. The VACUUM may be triggered + * by a user or by autovacuum. The specific actions performed by the AM + * will depend heavily on the individual AM. + * + * On entry a transaction is already established, and the relation is + * locked with a ShareUpdateExclusive lock. + * + * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through + * this routine, even if (in the latter case) it is part of the same + * VACUUM command. + * + * There probably, in the future, needs to be a separate callback to + * integrate with autovacuum's scheduling. + */ + void (*relation_vacuum) (Relation onerel, struct VacuumParams *params, + BufferAccessStrategy bstrategy); + + /* + * Prepare to analyze block `blockno` of `scan`. The scan has been started + * with table_beginscan_analyze(). See also + * table_scan_analyze_next_block(). + * + * The callback may acquire resources like locks that are held until + * table_scan_analyze_next_tuple() returns false. It e.g. can make sense + * to hold a lock until all tuples on a block have been analyzed by + * scan_analyze_next_tuple. + * + * The callback can return false if the block is not suitable for + * sampling, e.g. because it's a metapage that could never contain tuples. + * + * XXX: This obviously is primarily suited for block-based AMs. It's not + * clear what a good interface for non block based AMs would be, so don't + * try to invent one yet. + */ + bool (*scan_analyze_next_block) (TableScanDesc scan, + BlockNumber blockno, + BufferAccessStrategy bstrategy); + + /* + * See table_scan_analyze_next_tuple(). + * + * Not every AM has a meaningful concept of dead rows, in which + * case it's OK not to increment *deadrows - but note that this may + * influence autovacuum scheduling (see comment for relation_vacuum + * callback). + */ + bool (*scan_analyze_next_tuple) (TableScanDesc scan, + TransactionId OldestXmin, + double *liverows, + double *deadrows, + TupleTableSlot *slot); + /* see table_index_build_range_scan for reference about parameters */ double (*index_build_range_scan) (Relation heap_rel, Relation index_rel, @@ -1078,6 +1132,60 @@ table_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tups_recently_dead); } +/* + * Perform VACUUM on the relation. The VACUUM may be triggered by a user or + * by autovacuum. The specific actions performed by the AM will depend heavily + * on the individual AM. + * + * On entry a transaction needs to have already been established, and the + * relation is locked with a ShareUpdateExclusive lock. + * + * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this + * routine, even if (in the latter case) it is part of the same VACUUM command.
*/ +static inline void +table_relation_vacuum(Relation rel, struct VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + rel->rd_tableam->relation_vacuum(rel, params, bstrategy); +} + +/* + * Prepare to analyze block `blockno` of `scan`. The scan needs to have been + * started with table_beginscan_analyze(). Note that this routine might + * acquire resources like locks that are held until + * table_scan_analyze_next_tuple() returns false. + * + * Returns false if the block is unsuitable for sampling, true otherwise. + */ +static inline bool +table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, + BufferAccessStrategy bstrategy) +{ + return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno, + bstrategy); +} + +/* + * Iterate over tuples in the block selected with + * table_scan_analyze_next_block() (which needs to have returned true, and + * this routine may not have returned false for the same block before). If a + * tuple that's suitable for sampling is found, true is returned and a tuple + * is stored in `slot`. + * + * *liverows and *deadrows are incremented according to the encountered + * tuples. + */ +static inline bool +table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, + double *liverows, double *deadrows, + TupleTableSlot *slot) +{ + return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin, + liverows, deadrows, + slot); +} + /* * table_index_build_range_scan - scan the table to find tuples to be indexed *
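
For illustration, here is a condensed sketch of how a caller can drive the new AM-agnostic ANALYZE scan API introduced by this patch (table_beginscan_analyze(), table_scan_analyze_next_block(), table_scan_analyze_next_tuple()). It mirrors the shape of the rewritten acquire_sample_rows() loop but drops the reservoir-sampling bookkeeping; the helper name count_live_dead() and the fixed block range are hypothetical and not part of the patch, while the table_*, slot, GetOldestXmin(), and vacuum_delay_point() calls are existing backend APIs.

/*
 * Illustrative sketch only: walk the first `nblocks` blocks of a relation
 * with the AM-agnostic ANALYZE scan API and tally live/dead tuples.  Error
 * handling and sampling are omitted; count_live_dead() is not part of the
 * patch.
 */
#include "postgres.h"

#include "access/tableam.h"
#include "commands/vacuum.h"
#include "executor/tuptable.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "utils/rel.h"

static void
count_live_dead(Relation onerel, BlockNumber nblocks,
				BufferAccessStrategy bstrategy,
				double *liverows, double *deadrows)
{
	TransactionId OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM);
	TableScanDesc scan = table_beginscan_analyze(onerel);
	TupleTableSlot *slot = table_slot_create(onerel, NULL);
	BlockNumber blkno;

	*liverows = *deadrows = 0;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		vacuum_delay_point();

		/* the AM may reject blocks that can never contain tuples */
		if (!table_scan_analyze_next_block(scan, blkno, bstrategy))
			continue;

		/* returns false once the block is exhausted and AM resources released */
		while (table_scan_analyze_next_tuple(scan, OldestXmin,
											 liverows, deadrows, slot))
		{
			/* each iteration leaves one sampleable tuple in `slot` */
		}
	}

	ExecDropSingleTupleTableSlot(slot);
	table_endscan(scan);
}

How meaningful *liverows and *deadrows are depends on the AM, per the comment on the scan_analyze_next_tuple callback above.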
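
The comment retained in acquire_sample_rows() describes Vitter's reservoir sampling: the first targrows tuples fill the reservoir, and each later tuple replaces a randomly chosen entry so the reservoir stays a uniform sample of everything seen so far. As a self-contained illustration of that invariant, here is a minimal sketch using the simpler Algorithm R rather than the skip-count optimization provided by reservoir_get_next_S(); the program and its constants are purely illustrative.

/*
 * Minimal reservoir-sampling sketch (Algorithm R): item number t replaces a
 * random reservoir slot with probability k/t, so the reservoir remains a
 * uniform random sample of all items seen so far.
 */
#include <stdio.h>
#include <stdlib.h>

#define RESERVOIR_SIZE 5		/* plays the role of "targrows" */

int
main(void)
{
	int			reservoir[RESERVOIR_SIZE];
	long		t = 0;			/* items processed so far */
	int			item;
	int			i;

	srandom(42);

	for (item = 0; item < 1000; item++)
	{
		t++;
		if (t <= RESERVOIR_SIZE)
			reservoir[t - 1] = item;	/* fill phase: copy the first k items */
		else
		{
			long		j = random() % t;	/* roughly uniform in [0, t) */

			if (j < RESERVOIR_SIZE)
				reservoir[j] = item;	/* replace a randomly chosen slot */
		}
	}

	for (i = 0; i < RESERVOIR_SIZE; i++)
		printf("sample[%d] = %d\n", i, reservoir[i]);

	return 0;
}

acquire_sample_rows() produces the same distribution but avoids drawing a random number per tuple by precomputing how many tuples to skip (reservoir_get_next_S()), which matters when scanning large relations.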