diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f09ad8f271..e676969ed1 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -144,6 +144,10 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* Aggressive VACUUM (scan all unfrozen pages)? */ + bool aggressive; + /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ + bool skipwithvm; /* Wraparound failsafe has been triggered? */ bool failsafe_active; /* Consider index vacuuming bypass optimization? */ @@ -168,6 +172,8 @@ typedef struct LVRelState /* VACUUM operation's cutoff for freezing XIDs and MultiXactIds */ TransactionId FreezeLimit; MultiXactId MultiXactCutoff; + /* Are FreezeLimit/MultiXactCutoff still valid? */ + bool freeze_cutoffs_valid; /* Error reporting state */ char *relnamespace; @@ -189,10 +195,8 @@ typedef struct LVRelState */ VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ BlockNumber rel_pages; /* total number of pages */ - BlockNumber scanned_pages; /* number of pages we examined */ - BlockNumber pinskipped_pages; /* # of pages skipped due to a pin */ - BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ - BlockNumber tupcount_pages; /* # pages whose tuples we counted */ + BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ + BlockNumber frozenskipped_pages; /* # frozen pages skipped via VM */ BlockNumber removed_pages; /* # pages removed by relation truncation */ BlockNumber lpdead_item_pages; /* # pages with LP_DEAD items */ BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ @@ -205,6 +209,7 @@ typedef struct LVRelState /* Instrumentation counters */ int num_index_scans; + /* Counters that follow are only for scanned_pages */ int64 tuples_deleted; /* # deleted from table */ int64 lpdead_items; /* # deleted from indexes */ int64 new_dead_tuples; /* new estimated total # of dead items in @@ -241,19 +246,22 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ -static void lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, - bool aggressive); +static void lazy_scan_heap(LVRelState *vacrel, int nworkers); +static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + bool sharelock, Buffer vmbuffer); static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, GlobalVisState *vistest, LVPagePruneState *prunestate); +static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, + BlockNumber blkno, Page page, + bool *hastup, bool *recordfreespace); static void lazy_vacuum(LVRelState *vacrel); static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, int index, Buffer *vmbuffer); -static bool lazy_check_needs_freeze(Buffer buf, bool *hastup, - LVRelState *vacrel); static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); static void lazy_cleanup_all_indexes(LVRelState *vacrel); static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, @@ -308,16 +316,15 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, int usecs; double read_rate, write_rate; - bool aggressive; /* should we scan all unfrozen pages? */ - bool scanned_all_unfrozen; /* actually scanned all such pages? 
*/ + bool aggressive, + skipwithvm; + BlockNumber orig_rel_pages; char **indnames = NULL; TransactionId xidFullScanLimit; MultiXactId mxactFullScanLimit; BlockNumber new_rel_pages; BlockNumber new_rel_allvisible; double new_live_tuples; - TransactionId new_frozen_xid; - MultiXactId new_min_multi; ErrorContextCallback errcallback; PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; @@ -360,8 +367,16 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, xidFullScanLimit); aggressive |= MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid, mxactFullScanLimit); + skipwithvm = true; if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + { + /* + * Force aggressive mode, and disable skipping blocks using the + * visibility map (even those set all-frozen) + */ aggressive = true; + skipwithvm = false; + } /* * Setup error traceback support for ereport() first. The idea is to set @@ -424,6 +439,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED); Assert(params->truncate != VACOPTVALUE_UNSPECIFIED && params->truncate != VACOPTVALUE_AUTO); + vacrel->aggressive = aggressive; + vacrel->skipwithvm = skipwithvm; vacrel->failsafe_active = false; vacrel->consider_bypass_optimization = true; vacrel->do_index_vacuuming = true; @@ -455,35 +472,23 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->OldestXmin = OldestXmin; vacrel->FreezeLimit = FreezeLimit; vacrel->MultiXactCutoff = MultiXactCutoff; + /* Track if cutoffs became invalid (possible in !aggressive case only) */ + vacrel->freeze_cutoffs_valid = true; /* * Call lazy_scan_heap to perform all required heap pruning, index * vacuuming, and heap vacuuming (plus related processing) */ - lazy_scan_heap(vacrel, params, aggressive); + lazy_scan_heap(vacrel, params->nworkers); /* Done with indexes */ vac_close_indexes(vacrel->nindexes, vacrel->indrels, NoLock); /* - * Compute whether we actually scanned the all unfrozen pages. If we did, - * we can adjust relfrozenxid and relminmxid. - * - * NB: We need to check this before truncating the relation, because that - * will change ->rel_pages. - */ - if ((vacrel->scanned_pages + vacrel->frozenskipped_pages) - < vacrel->rel_pages) - { - Assert(!aggressive); - scanned_all_unfrozen = false; - } - else - scanned_all_unfrozen = true; - - /* - * Optionally truncate the relation. + * Optionally truncate the relation. But remember the relation size used + * by lazy_scan_heap for later first. */ + orig_rel_pages = vacrel->rel_pages; if (should_attempt_truncation(vacrel)) { update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_TRUNCATE, @@ -509,28 +514,43 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * * For safety, clamp relallvisible to be not more than what we're setting * relpages to. - * - * Also, don't change relfrozenxid/relminmxid if we skipped any pages, - * since then we don't know for certain that all tuples have a newer xmin. */ - new_rel_pages = vacrel->rel_pages; + new_rel_pages = vacrel->rel_pages; /* After possible rel truncation */ new_live_tuples = vacrel->new_live_tuples; visibilitymap_count(rel, &new_rel_allvisible, NULL); if (new_rel_allvisible > new_rel_pages) new_rel_allvisible = new_rel_pages; - new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId; - new_min_multi = scanned_all_unfrozen ? 
MultiXactCutoff : InvalidMultiXactId; - - vac_update_relstats(rel, - new_rel_pages, - new_live_tuples, - new_rel_allvisible, - vacrel->nindexes > 0, - new_frozen_xid, - new_min_multi, - false); + /* + * Aggressive VACUUM must reliably advance relfrozenxid (and relminmxid). + * We are able to advance relfrozenxid in a non-aggressive VACUUM too, + * provided we didn't skip any all-visible (not all-frozen) pages using + * the visibility map, and assuming that we didn't fail to get a cleanup + * lock that made it unsafe with respect to FreezeLimit (or perhaps our + * MultiXactCutoff) established for VACUUM operation. + * + * NB: We must use orig_rel_pages, not vacrel->rel_pages, since we want + * the rel_pages used by lazy_scan_heap, which won't match when we + * happened to truncate the relation afterwards. + */ + if (vacrel->scanned_pages + vacrel->frozenskipped_pages < orig_rel_pages || + !vacrel->freeze_cutoffs_valid) + { + /* Cannot advance relfrozenxid/relminmxid */ + Assert(!aggressive); + vac_update_relstats(rel, new_rel_pages, new_live_tuples, + new_rel_allvisible, vacrel->nindexes > 0, + InvalidTransactionId, InvalidMultiXactId, false); + } + else + { + Assert(vacrel->scanned_pages + vacrel->frozenskipped_pages == + orig_rel_pages); + vac_update_relstats(rel, new_rel_pages, new_live_tuples, + new_rel_allvisible, vacrel->nindexes > 0, + FreezeLimit, MultiXactCutoff, false); + } /* * Report results to the stats collector, too. @@ -558,7 +578,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, { StringInfoData buf; char *msgfmt; - BlockNumber orig_rel_pages; TimestampDifference(starttime, endtime, &secs, &usecs); @@ -610,10 +629,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->relnamespace, vacrel->relname, vacrel->num_index_scans); - appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"), + appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped frozen\n"), vacrel->removed_pages, vacrel->rel_pages, - vacrel->pinskipped_pages, vacrel->frozenskipped_pages); appendStringInfo(&buf, _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %u\n"), @@ -621,7 +639,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, (long long) vacrel->new_rel_tuples, (long long) vacrel->new_dead_tuples, OldestXmin); - orig_rel_pages = vacrel->rel_pages + vacrel->removed_pages; if (orig_rel_pages > 0) { if (vacrel->do_index_vacuuming) @@ -738,7 +755,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * supply. */ static void -lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) +lazy_scan_heap(LVRelState *vacrel, int nworkers) { VacDeadItems *dead_items; BlockNumber nblocks, @@ -757,14 +774,9 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) GlobalVisState *vistest; nblocks = RelationGetNumberOfBlocks(vacrel->rel); - next_unskippable_block = 0; - next_failsafe_block = 0; - next_fsm_block_to_vacuum = 0; vacrel->rel_pages = nblocks; vacrel->scanned_pages = 0; - vacrel->pinskipped_pages = 0; vacrel->frozenskipped_pages = 0; - vacrel->tupcount_pages = 0; vacrel->removed_pages = 0; vacrel->lpdead_item_pages = 0; vacrel->nonempty_pages = 0; @@ -788,14 +800,16 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) * dangerously old. */ lazy_check_wraparound_failsafe(vacrel); + next_failsafe_block = 0; /* * Allocate the space for dead_items. 
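
(Editorial aside, not part of the patch: the new relfrozenxid/relminmxid rule added to heap_vacuum_rel above can be restated as a small predicate. The helper below is a hypothetical reading aid only -- the LVRelState fields match the patch, but no such function exists in it.)

static bool
can_advance_relfrozenxid(const LVRelState *vacrel, BlockNumber orig_rel_pages)
{
	/* Every page was either scanned or skipped because it was all-frozen */
	if (vacrel->scanned_pages + vacrel->frozenskipped_pages < orig_rel_pages)
		return false;

	/*
	 * No page processed without a cleanup lock (lazy_scan_noprune) was left
	 * holding XIDs/MXIDs older than FreezeLimit/MultiXactCutoff
	 */
	if (!vacrel->freeze_cutoffs_valid)
		return false;

	return true;
}

(An aggressive VACUUM is expected to always satisfy both conditions, which is what the Assert(!aggressive) in the "cannot advance" branch checks from the other direction.)
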
Note that this handles parallel * VACUUM initialization as part of allocating shared memory space used * for dead_items. */ - dead_items_alloc(vacrel, params->nworkers); + dead_items_alloc(vacrel, nworkers); dead_items = vacrel->dead_items; + next_fsm_block_to_vacuum = 0; /* Report that we're scanning the heap, advertising total # of blocks */ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; @@ -804,7 +818,9 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) pgstat_progress_update_multi_param(3, initprog_index, initprog_val); /* - * Except when aggressive is set, we want to skip pages that are + * Set things up for skipping blocks using visibility map. + * + * Except when vacrel->aggressive is set, we want to skip pages that are * all-visible according to the visibility map, but only when we can skip * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading * sequentially, the OS should be doing readahead for us, so there's no @@ -813,8 +829,8 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) * page means that we can't update relfrozenxid, so we only want to do it * if we can skip a goodly number of pages. * - * When aggressive is set, we can't skip pages just because they are - * all-visible, but we can still skip pages that are all-frozen, since + * When vacrel->aggressive is set, we can't skip pages just because they + * are all-visible, but we can still skip pages that are all-frozen, since * such pages do not need freezing and do not affect the value that we can * safely set for relfrozenxid or relminmxid. * @@ -837,17 +853,9 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) * just added to that page are necessarily newer than the GlobalXmin we * computed, so they'll have no effect on the value to which we can safely * set relfrozenxid. A similar argument applies for MXIDs and relminmxid. - * - * We will scan the table's last page, at least to the extent of - * determining whether it has tuples or not, even if it should be skipped - * according to the above rules; except when we've already determined that - * it's not worth trying to truncate the table. This avoids having - * lazy_truncate_heap() take access-exclusive lock on the table to attempt - * a truncation that just fails immediately because there are tuples in - * the last page. This is worth avoiding mainly because such a lock must - * be replayed on any hot standby, where it can be disruptive. */ - if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + next_unskippable_block = 0; + if (vacrel->skipwithvm) { while (next_unskippable_block < nblocks) { @@ -856,7 +864,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) vmstatus = visibilitymap_get_status(vacrel->rel, next_unskippable_block, &vmbuffer); - if (aggressive) + if (vacrel->aggressive) { if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0) break; @@ -883,13 +891,6 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) bool all_visible_according_to_vm = false; LVPagePruneState prunestate; - /* - * Consider need to skip blocks. See note above about forcing - * scanning of last page. 
- */ -#define FORCE_CHECK_PAGE() \ - (blkno == nblocks - 1 && should_attempt_truncation(vacrel)) - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP, @@ -899,7 +900,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) { /* Time to advance next_unskippable_block */ next_unskippable_block++; - if ((params->options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + if (vacrel->skipwithvm) { while (next_unskippable_block < nblocks) { @@ -908,7 +909,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) vmskipflags = visibilitymap_get_status(vacrel->rel, next_unskippable_block, &vmbuffer); - if (aggressive) + if (vacrel->aggressive) { if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0) break; @@ -937,19 +938,24 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) * it's not all-visible. But in an aggressive vacuum we know only * that it's not all-frozen, so it might still be all-visible. */ - if (aggressive && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) + if (vacrel->aggressive && + VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) all_visible_according_to_vm = true; } else { /* - * The current block is potentially skippable; if we've seen a - * long enough run of skippable blocks to justify skipping it, and - * we're not forced to check it, then go ahead and skip. - * Otherwise, the page must be at least all-visible if not - * all-frozen, so we can set all_visible_according_to_vm = true. + * The current page can be skipped if we've seen a long enough run + * of skippable blocks to justify skipping it -- provided it's not + * the last page in the relation (according to rel_pages/nblocks). + * + * We always scan the table's last page to determine whether it + * has tuples or not, even if it would otherwise be skipped. This + * avoids having lazy_truncate_heap() take access-exclusive lock + * on the table to attempt a truncation that just fails + * immediately because there are tuples on the last page. */ - if (skipping_blocks && !FORCE_CHECK_PAGE()) + if (skipping_blocks && blkno < nblocks - 1) { /* * Tricky, tricky. If this is in aggressive vacuum, the page @@ -958,18 +964,32 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) * careful to count it as a skipped all-frozen page in that * case, or else we'll think we can't update relfrozenxid and * relminmxid. If it's not an aggressive vacuum, we don't - * know whether it was all-frozen, so we have to recheck; but - * in this case an approximate answer is OK. + * know whether it was initially all-frozen, so we have to + * recheck. */ - if (aggressive || VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) + if (vacrel->aggressive || + VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) vacrel->frozenskipped_pages++; continue; } + + /* + * SKIP_PAGES_THRESHOLD (threshold for skipping) was not + * crossed, or this is the last page. Scan the page, even + * though it's all-visible (and possibly even all-frozen). + */ all_visible_according_to_vm = true; } vacuum_delay_point(); + /* + * We're not skipping this page using the visibility map, and so it is + * (by definition) a scanned page. Any tuples from this page are now + * guaranteed to be counted below, after some preparatory checks. + */ + vacrel->scanned_pages++; + /* * Regularly check if wraparound failsafe should trigger. 
* @@ -1024,174 +1044,78 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) } /* - * Set up visibility map page as needed. - * * Pin the visibility map page in case we need to mark the page * all-visible. In most cases this will be very cheap, because we'll - * already have the correct page pinned anyway. However, it's - * possible that (a) next_unskippable_block is covered by a different - * VM page than the current block or (b) we released our pin and did a - * cycle of index vacuuming. + * already have the correct page pinned anyway. */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + /* Finished preparatory checks. Actually scan the page. */ buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); + page = BufferGetPage(buf); /* - * We need buffer cleanup lock so that we can prune HOT chains and - * defragment the page. + * We need a buffer cleanup lock to prune HOT chains and defragment + * the page in lazy_scan_prune. But when it's not possible to acquire + * a cleanup lock right away, we may be able to settle for reduced + * processing using lazy_scan_noprune. */ if (!ConditionalLockBufferForCleanup(buf)) { - bool hastup; + bool hastup, + recordfreespace; - /* - * If we're not performing an aggressive scan to guard against XID - * wraparound, and we don't want to forcibly check the page, then - * it's OK to skip vacuuming pages we get a lock conflict on. They - * will be dealt with in some future vacuum. - */ - if (!aggressive && !FORCE_CHECK_PAGE()) - { - ReleaseBuffer(buf); - vacrel->pinskipped_pages++; - continue; - } - - /* - * Read the page with share lock to see if any xids on it need to - * be frozen. If not we just skip the page, after updating our - * scan statistics. If there are some, we wait for cleanup lock. - * - * We could defer the lock request further by remembering the page - * and coming back to it later, or we could even register - * ourselves for multiple buffers and then service whichever one - * is received first. For now, this seems good enough. - * - * If we get here with aggressive false, then we're just forcibly - * checking the page, and so we don't want to insist on getting - * the lock; we only need to know if the page contains tuples, so - * that we can update nonempty_pages correctly. It's convenient - * to use lazy_check_needs_freeze() for both situations, though. - */ LockBuffer(buf, BUFFER_LOCK_SHARE); - if (!lazy_check_needs_freeze(buf, &hastup, vacrel)) + + /* Check for new or empty pages before lazy_scan_noprune call */ + if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, true, + vmbuffer)) { - UnlockReleaseBuffer(buf); - vacrel->scanned_pages++; - vacrel->pinskipped_pages++; - if (hastup) - vacrel->nonempty_pages = blkno + 1; + /* Processed as new/empty page (lock and pin released) */ continue; } - if (!aggressive) + + /* Collect LP_DEAD items in dead_items array, count tuples */ + if (lazy_scan_noprune(vacrel, buf, blkno, page, &hastup, + &recordfreespace)) { + Size freespace; + /* - * Here, we must not advance scanned_pages; that would amount - * to claiming that the page contains no freezable tuples. + * Processed page successfully (without cleanup lock) -- just + * need to perform rel truncation and FSM steps, much like the + * lazy_scan_prune case. Don't bother trying to match its + * visibility map setting steps, though. 
*/ - UnlockReleaseBuffer(buf); - vacrel->pinskipped_pages++; if (hastup) vacrel->nonempty_pages = blkno + 1; + if (recordfreespace) + freespace = PageGetHeapFreeSpace(page); + UnlockReleaseBuffer(buf); + if (recordfreespace) + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); continue; } + + /* + * lazy_scan_noprune could not do all required processing. Wait + * for a cleanup lock, and call lazy_scan_prune in the usual way. + */ + Assert(vacrel->aggressive); LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); - /* drop through to normal processing */ } - /* - * By here we definitely have enough dead_items space for whatever - * LP_DEAD tids are on this page, we have the visibility map page set - * up in case we need to set this page's all_visible/all_frozen bit, - * and we have a cleanup lock. Any tuples on this page are now sure - * to be "counted" by this VACUUM. - * - * One last piece of preamble needs to take place before we can prune: - * we need to consider new and empty pages. - */ - vacrel->scanned_pages++; - vacrel->tupcount_pages++; - - page = BufferGetPage(buf); - - if (PageIsNew(page)) + /* Check for new or empty pages before lazy_scan_prune call */ + if (lazy_scan_new_or_empty(vacrel, buf, blkno, page, false, vmbuffer)) { - /* - * All-zeroes pages can be left over if either a backend extends - * the relation by a single page, but crashes before the newly - * initialized page has been written out, or when bulk-extending - * the relation (which creates a number of empty pages at the tail - * end of the relation, but enters them into the FSM). - * - * Note we do not enter the page into the visibilitymap. That has - * the downside that we repeatedly visit this page in subsequent - * vacuums, but otherwise we'll never not discover the space on a - * promoted standby. The harm of repeated checking ought to - * normally not be too bad - the space usually should be used at - * some point, otherwise there wouldn't be any regular vacuums. - * - * Make sure these pages are in the FSM, to ensure they can be - * reused. Do that by testing if there's any space recorded for - * the page. If not, enter it. We do so after releasing the lock - * on the heap page, the FSM is approximate, after all. - */ - UnlockReleaseBuffer(buf); - - if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) - { - Size freespace = BLCKSZ - SizeOfPageHeaderData; - - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); - } - continue; - } - - if (PageIsEmpty(page)) - { - Size freespace = PageGetHeapFreeSpace(page); - - /* - * Empty pages are always all-visible and all-frozen (note that - * the same is currently not true for new pages, see above). - */ - if (!PageIsAllVisible(page)) - { - START_CRIT_SECTION(); - - /* mark buffer dirty before writing a WAL record */ - MarkBufferDirty(buf); - - /* - * It's possible that another backend has extended the heap, - * initialized the page, and then failed to WAL-log the page - * due to an ERROR. Since heap extension is not WAL-logged, - * recovery might try to replay our record setting the page - * all-visible and find that the page isn't initialized, which - * will cause a PANIC. To prevent that, check whether the - * page has been previously WAL-logged, and if not, do that - * now. 
- */ - if (RelationNeedsWAL(vacrel->rel) && - PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); - - PageSetAllVisible(page); - visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - END_CRIT_SECTION(); - } - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + /* Processed as new/empty page (lock and pin released) */ continue; } /* - * Prune and freeze tuples. + * Prune, freeze, and count tuples. * * Accumulates details of remaining LP_DEAD line pointers on page in * dead_items array. This includes LP_DEAD line pointers that we @@ -1399,7 +1323,7 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) /* now we can compute the new value for pg_class.reltuples */ vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, nblocks, - vacrel->tupcount_pages, + vacrel->scanned_pages, vacrel->live_tuples); /* @@ -1448,6 +1372,137 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) update_index_statistics(vacrel); } +/* + * lazy_scan_new_or_empty() -- lazy_scan_heap() new/empty page handling. + * + * Must call here to handle both new and empty pages before calling + * lazy_scan_prune or lazy_scan_noprune, since they're not prepared to deal + * with new or empty pages. + * + * It's necessary to consider new pages as a special case, since the rules for + * maintaining the visibility map and FSM with empty pages are a little + * different (though new pages can be truncated based on the usual rules). + * + * Empty pages are not really a special case -- they're just heap pages that + * have no allocated tuples (including even LP_UNUSED items). You might + * wonder why we need to handle them here all the same. It's only necessary + * because of a corner-case involving a hard crash during heap relation + * extension. If we ever make relation-extension crash safe, then it should + * no longer be necessary to deal with empty pages here (or new pages, for + * that matter). + * + * Caller must hold at least a shared lock. We might need to escalate the + * lock in that case, so the type of lock caller holds needs to be specified + * using 'sharelock' argument. + * + * Returns false in common case where caller should go on to call + * lazy_scan_prune (or lazy_scan_noprune). Otherwise returns true, indicating + * that lazy_scan_heap is done processing the page, releasing lock on caller's + * behalf. + */ +static bool +lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, + Page page, bool sharelock, Buffer vmbuffer) +{ + Size freespace; + + if (PageIsNew(page)) + { + /* + * All-zeroes pages can be left over if either a backend extends the + * relation by a single page, but crashes before the newly initialized + * page has been written out, or when bulk-extending the relation + * (which creates a number of empty pages at the tail end of the + * relation), and then enters them into the FSM. + * + * Note we do not enter the page into the visibilitymap. That has the + * downside that we repeatedly visit this page in subsequent vacuums, + * but otherwise we'll never discover the space on a promoted standby. + * The harm of repeated checking ought to normally not be too bad. The + * space usually should be used at some point, otherwise there + * wouldn't be any regular vacuums. + * + * Make sure these pages are in the FSM, to ensure they can be reused. 
+ * Do that by testing if there's any space recorded for the page. If + * not, enter it. We do so after releasing the lock on the heap page, + * the FSM is approximate, after all. + */ + UnlockReleaseBuffer(buf); + + if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) + { + freespace = BLCKSZ - SizeOfPageHeaderData; + + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + } + + return true; + } + + if (PageIsEmpty(page)) + { + /* + * It seems likely that caller will always be able to get a cleanup + * lock on an empty page. But don't take any chances -- escalate to + * an exclusive lock (still don't need a cleanup lock, though). + */ + if (sharelock) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (!PageIsEmpty(page)) + { + /* page isn't new or empty -- keep lock and pin for now */ + return false; + } + } + else + { + /* Already have a full cleanup lock (which is more than enough) */ + } + + /* + * Unlike new pages, empty pages are always set all-visible and + * all-frozen. + */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page due + * to an ERROR. Since heap extension is not WAL-logged, recovery + * might try to replay our record setting the page all-visible and + * find that the page isn't initialized, which will cause a PANIC. + * To prevent that, check whether the page has been previously + * WAL-logged, and if not, do that now. + */ + if (RelationNeedsWAL(vacrel->rel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } + + freespace = PageGetHeapFreeSpace(page); + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); + return true; + } + + /* page isn't new or empty -- keep lock and pin */ + return false; +} + /* * lazy_scan_prune() -- lazy_scan_heap() pruning and freezing. * @@ -1492,6 +1547,8 @@ lazy_scan_prune(LVRelState *vacrel, OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage]; + Assert(BufferGetBlockNumber(buf) == blkno); + maxoff = PageGetMaxOffsetNumber(page); retry: @@ -1554,10 +1611,9 @@ retry: * LP_DEAD items are processed outside of the loop. * * Note that we deliberately don't set hastup=true in the case of an - * LP_DEAD item here, which is not how lazy_check_needs_freeze() or - * count_nondeletable_pages() do it -- they only consider pages empty - * when they only have LP_UNUSED items, which is important for - * correctness. + * LP_DEAD item here, which is not how count_nondeletable_pages() does + * it -- it only considers pages empty/truncatable when they have no + * items at all (except LP_UNUSED items). * * Our assumption is that any LP_DEAD items we encounter here will * become LP_UNUSED inside lazy_vacuum_heap_page() before we actually @@ -1844,6 +1900,226 @@ retry: vacrel->live_tuples += live_tuples; } +/* + * lazy_scan_noprune() -- lazy_scan_prune() without pruning or freezing + * + * Caller need only hold a pin and share lock on the buffer, unlike + * lazy_scan_prune, which requires a full cleanup lock. 
While pruning isn't + * performed here, it's quite possible that an earlier opportunistic pruning + * operation left LP_DEAD items behind. We'll at least collect any such items + * in the dead_items array for removal from indexes. + * + * For aggressive VACUUM callers, we may return false to indicate that a full + * cleanup lock is required for processing by lazy_scan_prune. This is only + * necessary when the aggressive VACUUM needs to freeze some tuple XIDs from + * one or more tuples on the page. We always return true for non-aggressive + * callers. + * + * See lazy_scan_prune for an explanation of hastup return flag. + * recordfreespace flag instructs caller on whether or not it should do + * generic FSM processing for page. + */ +static bool +lazy_scan_noprune(LVRelState *vacrel, + Buffer buf, + BlockNumber blkno, + Page page, + bool *hastup, + bool *recordfreespace) +{ + OffsetNumber offnum, + maxoff; + int lpdead_items, + num_tuples, + live_tuples, + new_dead_tuples; + HeapTupleHeader tupleheader; + OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + + Assert(BufferGetBlockNumber(buf) == blkno); + + *hastup = false; /* for now */ + *recordfreespace = false; /* for now */ + + lpdead_items = 0; + num_tuples = 0; + live_tuples = 0; + new_dead_tuples = 0; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + vacrel->offnum = offnum; + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid)) + continue; + + if (ItemIdIsRedirected(itemid)) + { + *hastup = true; + continue; + } + + if (ItemIdIsDead(itemid)) + { + /* + * Deliberately don't set hastup=true here. See same point in + * lazy_scan_prune for an explanation. + */ + deadoffsets[lpdead_items++] = offnum; + continue; + } + + *hastup = true; /* page prevents rel truncation */ + tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); + if (heap_tuple_needs_freeze(tupleheader, + vacrel->FreezeLimit, + vacrel->MultiXactCutoff, buf)) + { + if (vacrel->aggressive) + { + /* Going to have to get cleanup lock for lazy_scan_prune */ + vacrel->offnum = InvalidOffsetNumber; + return false; + } + + /* + * Current non-aggressive VACUUM operation definitely won't be + * able to advance relfrozenxid or relminmxid + */ + vacrel->freeze_cutoffs_valid = false; + } + + num_tuples++; + ItemPointerSet(&(tuple.t_self), blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(vacrel->rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf)) + { + case HEAPTUPLE_DELETE_IN_PROGRESS: + case HEAPTUPLE_LIVE: + + /* + * Count both cases as live, just like lazy_scan_prune + */ + live_tuples++; + + break; + case HEAPTUPLE_DEAD: + + /* + * There is some useful work for pruning to do, that won't be + * done due to failure to get a cleanup lock. 
+ * + * TODO Add dedicated instrumentation for this case + */ + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * Count in new_dead_tuples, just like lazy_scan_prune + */ + new_dead_tuples++; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Do not count these rows as live, just like lazy_scan_prune + */ + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + } + + vacrel->offnum = InvalidOffsetNumber; + + /* + * Now save details of the LP_DEAD items from the page in vacrel (though + * only when VACUUM uses two-pass strategy) + */ + if (vacrel->nindexes == 0) + { + /* Using one-pass strategy (since table has no indexes) */ + if (lpdead_items > 0) + { + /* + * Perfunctory handling for the corner case where a single pass + * strategy VACUUM cannot get a cleanup lock, and it turns out + * that there is one or more LP_DEAD items: just count the LP_DEAD + * items as missed_dead_tuples instead. (This is a bit dishonest, + * but it beats having to maintain specialized heap vacuuming code + * forever, for vanishingly little benefit.) + */ + *hastup = true; + num_tuples += lpdead_items; + /* TODO HEAPTUPLE_DEAD style instrumentation needed here, too */ + } + + *recordfreespace = true; + } + else if (lpdead_items == 0) + { + /* + * Won't be vacuuming this page later, so record page's freespace in + * the FSM now + */ + *recordfreespace = true; + } + else + { + VacDeadItems *dead_items = vacrel->dead_items; + ItemPointerData tmp; + + /* + * Page has LP_DEAD items, and so any references/TIDs that remain in + * indexes will be deleted during index vacuuming (and then marked + * LP_UNUSED in the heap) + */ + vacrel->lpdead_item_pages++; + + ItemPointerSetBlockNumber(&tmp, blkno); + + for (int i = 0; i < lpdead_items; i++) + { + ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); + dead_items->items[dead_items->num_items++] = tmp; + } + + Assert(dead_items->num_items <= dead_items->max_items); + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + dead_items->num_items); + + vacrel->lpdead_items += lpdead_items; + + /* + * Assume that we'll go on to vacuum this heap page during final pass + * over the heap. Don't record free space until then. + */ + *recordfreespace = false; + } + + /* + * Finally, add relevant page-local counts to whole-VACUUM counts + */ + vacrel->new_dead_tuples += new_dead_tuples; + vacrel->num_tuples += num_tuples; + vacrel->live_tuples += live_tuples; + + /* Caller won't need to call lazy_scan_prune with same page */ + return true; +} + /* * Main entry point for index vacuuming and heap vacuuming. * @@ -2287,67 +2563,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, return index; } -/* - * lazy_check_needs_freeze() -- scan page to see if any tuples - * need to be cleaned to avoid wraparound - * - * Returns true if the page needs to be vacuumed using cleanup lock. - * Also returns a flag indicating whether page contains any tuples at all. - */ -static bool -lazy_check_needs_freeze(Buffer buf, bool *hastup, LVRelState *vacrel) -{ - Page page = BufferGetPage(buf); - OffsetNumber offnum, - maxoff; - HeapTupleHeader tupleheader; - - *hastup = false; - - /* - * New and empty pages, obviously, don't contain tuples. We could make - * sure that the page is registered in the FSM, but it doesn't seem worth - * waiting for a cleanup lock just for that, especially because it's - * likely that the pin holder will do so. 
 */
-	if (PageIsNew(page) || PageIsEmpty(page))
-		return false;
-
-	maxoff = PageGetMaxOffsetNumber(page);
-	for (offnum = FirstOffsetNumber;
-		 offnum <= maxoff;
-		 offnum = OffsetNumberNext(offnum))
-	{
-		ItemId		itemid;
-
-		/*
-		 * Set the offset number so that we can display it along with any
-		 * error that occurred while processing this tuple.
-		 */
-		vacrel->offnum = offnum;
-		itemid = PageGetItemId(page, offnum);
-
-		/* this should match hastup test in count_nondeletable_pages() */
-		if (ItemIdIsUsed(itemid))
-			*hastup = true;
-
-		/* dead and redirect items never need freezing */
-		if (!ItemIdIsNormal(itemid))
-			continue;
-
-		tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
-
-		if (heap_tuple_needs_freeze(tupleheader, vacrel->FreezeLimit,
-									vacrel->MultiXactCutoff, buf))
-			break;
-	}							/* scan along page */
-
-	/* Clear the offset information once we have processed the given page. */
-	vacrel->offnum = InvalidOffsetNumber;
-
-	return (offnum <= maxoff);
-}
-
 /*
  * Trigger the failsafe to avoid wraparound failure when vacrel table has a
  * relfrozenxid and/or relminmxid that is dangerously far in the past.
@@ -2413,7 +2628,7 @@ lazy_cleanup_all_indexes(LVRelState *vacrel)
 	{
 		double		reltuples = vacrel->new_rel_tuples;
 		bool		estimated_count =
-		vacrel->tupcount_pages < vacrel->rel_pages;
+		vacrel->scanned_pages < vacrel->rel_pages;
 
 		for (int idx = 0; idx < vacrel->nindexes; idx++)
 		{
@@ -2430,7 +2645,7 @@ lazy_cleanup_all_indexes(LVRelState *vacrel)
 		/* Outsource everything to parallel variant */
 		parallel_vacuum_cleanup_all_indexes(vacrel->pvs, vacrel->new_rel_tuples,
 											vacrel->num_index_scans,
-											(vacrel->tupcount_pages < vacrel->rel_pages));
+											(vacrel->scanned_pages < vacrel->rel_pages));
 	}
 }
@@ -2537,7 +2752,9 @@ lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat,
  * should_attempt_truncation - should we attempt to truncate the heap?
  *
  * Don't even think about it unless we have a shot at releasing a goodly
- * number of pages. Otherwise, the time taken isn't worth it.
+ * number of pages. Otherwise, the time taken isn't worth it, mainly because
+ * an AccessExclusive lock must be replayed on any hot standby, where it can
+ * be particularly disruptive.
  *
  * Also don't attempt it if wraparound failsafe is in effect. It's hard to
  * predict how long lazy_truncate_heap will take. Don't take any chances.
diff --git a/src/test/isolation/expected/vacuum-reltuples.out b/src/test/isolation/expected/vacuum-reltuples.out
index cdbe7f3a60..ce55376e7f 100644
--- a/src/test/isolation/expected/vacuum-reltuples.out
+++ b/src/test/isolation/expected/vacuum-reltuples.out
@@ -45,7 +45,7 @@ step stats:
 relpages|reltuples
 --------+---------
-       1|       20
+       1|       21
 (1 row)
diff --git a/src/test/isolation/specs/vacuum-reltuples.spec b/src/test/isolation/specs/vacuum-reltuples.spec
index ae2f79b8fe..a2a461f2f5 100644
--- a/src/test/isolation/specs/vacuum-reltuples.spec
+++ b/src/test/isolation/specs/vacuum-reltuples.spec
@@ -2,9 +2,10 @@
 # to page pins. We absolutely need to avoid setting reltuples=0 in
 # such cases, since that interferes badly with planning.
 #
-# Expected result in second permutation is 20 tuples rather than 21 as
-# for the others, because vacuum should leave the previous result
-# (from before the insert) in place.
+# Expected result for all three permutations is 21 tuples, including
+# the second permutation. VACUUM is able to count the concurrently
+# inserted tuple in its final reltuples, even when a cleanup lock
+# cannot be acquired on the affected heap page.
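
(Editorial aside, not part of the patch: the expected-output change from 20 to 21 above follows from basing the reltuples estimate on scanned_pages. The pinned page is still counted as scanned and its tuples are counted by lazy_scan_noprune, so for this one-page table the estimate reduces to the exact count. A hypothetical condensation, assuming vacuum.c's existing vac_estimate_reltuples() behavior is unchanged by this patch:)

	/* the branch vac_estimate_reltuples() takes when the whole table was scanned */
	if (scanned_pages >= total_pages)
		return scanned_tuples;	/* exact count: 21, not the stale 20 */
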
setup { create table smalltbl