From b61d161c146328ae6ba9ed937862d66e5c8b035a Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Mon, 30 Mar 2020 07:33:38 +0530 Subject: [PATCH] Introduce vacuum errcontext to display additional information. The additional information displayed will be block number for error occurring while processing heap and index name for error occurring while processing the index. This will help us in diagnosing the problems that occur during a vacuum. For ex. due to corruption (either caused by bad hardware or by some bug) if we get some error while vacuuming, it can help us identify the block in heap and or additional index information. It sets up an error context callback to display additional information with the error. During different phases of vacuum (heap scan, heap vacuum, index vacuum, index clean up, heap truncate), we update the error context callback to display appropriate information. We can extend it to a bit more granular level like adding the phases for FSM operations or for prefetching the blocks while truncating. However, I felt that it requires adding many more error callback function calls and can make the code a bit complex, so left those for now. Author: Justin Pryzby, with few changes by Amit Kapila Reviewed-by: Alvaro Herrera, Amit Kapila, Andres Freund, Michael Paquier and Sawada Masahiko Discussion: https://www.postgresql.org/message-id/20191120210600.GC30362@telsasoft.com --- src/backend/access/heap/vacuumlazy.c | 240 ++++++++++++++++++++++++--- src/tools/pgindent/typedefs.list | 1 + 2 files changed, 216 insertions(+), 25 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 03c43efc32..9726f69629 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -144,6 +144,17 @@ */ #define ParallelVacuumIsActive(lps) PointerIsValid(lps) +/* Phases of vacuum during which we report error context. */ +typedef enum +{ + VACUUM_ERRCB_PHASE_UNKNOWN, + VACUUM_ERRCB_PHASE_SCAN_HEAP, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + VACUUM_ERRCB_PHASE_VACUUM_HEAP, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + VACUUM_ERRCB_PHASE_TRUNCATE +} VacErrPhase; + /* * LVDeadTuples stores the dead tuple TIDs collected during the heap scan. * This is allocated in the DSM segment in parallel mode and in local memory @@ -270,6 +281,8 @@ typedef struct LVParallelState typedef struct LVRelStats { + char *relnamespace; + char *relname; /* useindex = true means two-pass strategy; false means one-pass */ bool useindex; /* Overall statistics about rel */ @@ -290,8 +303,12 @@ typedef struct LVRelStats int num_index_scans; TransactionId latestRemovedXid; bool lock_waiter_detected; -} LVRelStats; + /* Used for error callback */ + char *indname; + BlockNumber blkno; /* used only for heap operations */ + VacErrPhase phase; +} LVRelStats; /* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; @@ -314,10 +331,10 @@ static void lazy_vacuum_all_indexes(Relation onerel, Relation *Irel, LVRelStats *vacrelstats, LVParallelState *lps, int nindexes); static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, - LVDeadTuples *dead_tuples, double reltuples); + LVDeadTuples *dead_tuples, double reltuples, LVRelStats *vacrelstats); static void lazy_cleanup_index(Relation indrel, IndexBulkDeleteResult **stats, - double reltuples, bool estimated_count); + double reltuples, bool estimated_count, LVRelStats *vacrelstats); static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); static bool should_attempt_truncation(VacuumParams *params, @@ -337,13 +354,13 @@ static void lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult * int nindexes); static void parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats, LVShared *lvshared, LVDeadTuples *dead_tuples, - int nindexes); + int nindexes, LVRelStats *vacrelstats); static void vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats, LVParallelState *lps, int nindexes); static void vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats, LVShared *lvshared, LVSharedIndStats *shared_indstats, - LVDeadTuples *dead_tuples); + LVDeadTuples *dead_tuples, LVRelStats *vacrelstats); static void lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats, LVRelStats *vacrelstats, LVParallelState *lps, int nindexes); @@ -361,6 +378,9 @@ static void end_parallel_vacuum(Relation *Irel, IndexBulkDeleteResult **stats, LVParallelState *lps, int nindexes); static LVSharedIndStats *get_indstats(LVShared *lvshared, int n); static bool skip_parallel_vacuum_index(Relation indrel, LVShared *lvshared); +static void vacuum_error_callback(void *arg); +static void update_vacuum_error_info(LVRelStats *errinfo, int phase, + BlockNumber blkno, char *indname); /* @@ -394,6 +414,7 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, double new_live_tuples; TransactionId new_frozen_xid; MultiXactId new_min_multi; + ErrorContextCallback errcallback; Assert(params != NULL); Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT); @@ -460,6 +481,10 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); + vacrelstats->relnamespace = get_namespace_name(RelationGetNamespace(onerel)); + vacrelstats->relname = pstrdup(RelationGetRelationName(onerel)); + vacrelstats->indname = NULL; + vacrelstats->phase = VACUUM_ERRCB_PHASE_UNKNOWN; vacrelstats->old_rel_pages = onerel->rd_rel->relpages; vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; @@ -471,6 +496,22 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, vacrelstats->useindex = (nindexes > 0 && params->index_cleanup == VACOPT_TERNARY_ENABLED); + /* + * Setup error traceback support for ereport(). The idea is to set up an + * error context callback to display additional information on any error + * during a vacuum. During different phases of vacuum (heap scan, heap + * vacuum, index vacuum, index clean up, heap truncate), we update the + * error context callback to display appropriate information. + * + * Note that the index vacuum and heap vacuum phases may be called + * multiple times in the middle of the heap scan phase. So the old phase + * information is restored at the end of those phases. + */ + errcallback.callback = vacuum_error_callback; + errcallback.arg = vacrelstats; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* Do the vacuuming */ lazy_scan_heap(onerel, params, vacrelstats, Irel, nindexes, aggressive); @@ -497,7 +538,19 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params, * Optionally truncate the relation. */ if (should_attempt_truncation(params, vacrelstats)) + { + /* + * Update error traceback information. This is the last phase during + * which we add context information to errors, so we don't need to + * revert to the previous phase. + */ + update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_TRUNCATE, + vacrelstats->nonempty_pages, NULL); lazy_truncate_heap(onerel, vacrelstats); + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; /* Report that we are now doing final cleanup */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, @@ -699,7 +752,6 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, BlockNumber nblocks, blkno; HeapTupleData tuple; - char *relname; TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; TransactionId relminmxid = onerel->rd_rel->relminmxid; BlockNumber empty_pages, @@ -727,17 +779,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, pg_rusage_init(&ru0); - relname = RelationGetRelationName(onerel); if (aggressive) ereport(elevel, (errmsg("aggressively vacuuming \"%s.%s\"", - get_namespace_name(RelationGetNamespace(onerel)), - relname))); + vacrelstats->relnamespace, + vacrelstats->relname))); else ereport(elevel, (errmsg("vacuuming \"%s.%s\"", - get_namespace_name(RelationGetNamespace(onerel)), - relname))); + vacrelstats->relnamespace, + vacrelstats->relname))); empty_pages = vacuumed_pages = 0; next_fsm_block_to_vacuum = (BlockNumber) 0; @@ -893,6 +944,9 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_SCAN_HEAP, + blkno, NULL); + if (blkno == next_unskippable_block) { /* Time to advance next_unskippable_block */ @@ -1534,7 +1588,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) { elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - relname, blkno); + vacrelstats->relname, blkno); visibilitymap_clear(onerel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); } @@ -1555,7 +1609,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", - relname, blkno); + vacrelstats->relname, blkno); PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(onerel, blkno, vmbuffer, @@ -1744,7 +1798,7 @@ lazy_vacuum_all_indexes(Relation onerel, Relation *Irel, for (idx = 0; idx < nindexes; idx++) lazy_vacuum_index(Irel[idx], &stats[idx], vacrelstats->dead_tuples, - vacrelstats->old_live_tuples); + vacrelstats->old_live_tuples, vacrelstats); } /* Increase and report the number of index scans */ @@ -1772,11 +1826,17 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) int npages; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; + LVRelStats olderrinfo; /* Report that we are now vacuuming the heap */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_VACUUM_HEAP); + /* Update error traceback information */ + olderrinfo = *vacrelstats; + update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP, + InvalidBlockNumber, NULL); + pg_rusage_init(&ru0); npages = 0; @@ -1791,6 +1851,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples->itemptrs[tupindex]); + vacrelstats->blkno = tblk; buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); if (!ConditionalLockBufferForCleanup(buf)) @@ -1822,6 +1883,12 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) RelationGetRelationName(onerel), tupindex, npages), errdetail_internal("%s", pg_rusage_show(&ru0)))); + + /* Revert to the previous phase information for error traceback */ + update_vacuum_error_info(vacrelstats, + olderrinfo.phase, + olderrinfo.blkno, + olderrinfo.indname); } /* @@ -1844,9 +1911,15 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int uncnt = 0; TransactionId visibility_cutoff_xid; bool all_frozen; + LVRelStats olderrinfo; pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + /* Update error traceback information */ + olderrinfo = *vacrelstats; + update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP, + blkno, NULL); + START_CRIT_SECTION(); for (; tupindex < dead_tuples->num_tuples; tupindex++) @@ -1923,6 +1996,11 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, *vmbuffer, visibility_cutoff_xid, flags); } + /* Revert to the previous phase information for error traceback */ + update_vacuum_error_info(vacrelstats, + olderrinfo.phase, + olderrinfo.blkno, + olderrinfo.indname); return tupindex; } @@ -2083,7 +2161,7 @@ lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats, * indexes in the case where no workers are launched. */ parallel_vacuum_index(Irel, stats, lps->lvshared, - vacrelstats->dead_tuples, nindexes); + vacrelstats->dead_tuples, nindexes, vacrelstats); /* Wait for all vacuum workers to finish */ WaitForParallelWorkersToFinish(lps->pcxt); @@ -2106,7 +2184,7 @@ lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats, static void parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats, LVShared *lvshared, LVDeadTuples *dead_tuples, - int nindexes) + int nindexes, LVRelStats *vacrelstats) { /* * Increment the active worker count if we are able to launch any worker. @@ -2140,7 +2218,7 @@ parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats, /* Do vacuum or cleanup of the index */ vacuum_one_index(Irel[idx], &(stats[idx]), lvshared, shared_indstats, - dead_tuples); + dead_tuples, vacrelstats); } /* @@ -2180,7 +2258,8 @@ vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats, if (shared_indstats == NULL || skip_parallel_vacuum_index(Irel[i], lps->lvshared)) vacuum_one_index(Irel[i], &(stats[i]), lps->lvshared, - shared_indstats, vacrelstats->dead_tuples); + shared_indstats, vacrelstats->dead_tuples, + vacrelstats); } /* @@ -2200,7 +2279,7 @@ vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats, static void vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats, LVShared *lvshared, LVSharedIndStats *shared_indstats, - LVDeadTuples *dead_tuples) + LVDeadTuples *dead_tuples, LVRelStats *vacrelstats) { IndexBulkDeleteResult *bulkdelete_res = NULL; @@ -2220,10 +2299,10 @@ vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats, /* Do vacuum or cleanup of the index */ if (lvshared->for_cleanup) lazy_cleanup_index(indrel, stats, lvshared->reltuples, - lvshared->estimated_count); + lvshared->estimated_count, vacrelstats); else lazy_vacuum_index(indrel, stats, dead_tuples, - lvshared->reltuples); + lvshared->reltuples, vacrelstats); /* * Copy the index bulk-deletion result returned from ambulkdelete and @@ -2298,7 +2377,8 @@ lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats, for (idx = 0; idx < nindexes; idx++) lazy_cleanup_index(Irel[idx], &stats[idx], vacrelstats->new_rel_tuples, - vacrelstats->tupcount_pages < vacrelstats->rel_pages); + vacrelstats->tupcount_pages < vacrelstats->rel_pages, + vacrelstats); } } @@ -2313,11 +2393,12 @@ lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats, */ static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, - LVDeadTuples *dead_tuples, double reltuples) + LVDeadTuples *dead_tuples, double reltuples, LVRelStats *vacrelstats) { IndexVacuumInfo ivinfo; const char *msg; PGRUsage ru0; + LVRelStats olderrinfo; pg_rusage_init(&ru0); @@ -2329,6 +2410,13 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, ivinfo.num_heap_tuples = reltuples; ivinfo.strategy = vac_strategy; + /* Update error traceback information */ + olderrinfo = *vacrelstats; + update_vacuum_error_info(vacrelstats, + VACUUM_ERRCB_PHASE_VACUUM_INDEX, + InvalidBlockNumber, + RelationGetRelationName(indrel)); + /* Do bulk deletion */ *stats = index_bulk_delete(&ivinfo, *stats, lazy_tid_reaped, (void *) dead_tuples); @@ -2343,6 +2431,12 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, RelationGetRelationName(indrel), dead_tuples->num_tuples), errdetail_internal("%s", pg_rusage_show(&ru0)))); + + /* Revert to the previous phase information for error traceback */ + update_vacuum_error_info(vacrelstats, + olderrinfo.phase, + olderrinfo.blkno, + olderrinfo.indname); } /* @@ -2354,11 +2448,12 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats, static void lazy_cleanup_index(Relation indrel, IndexBulkDeleteResult **stats, - double reltuples, bool estimated_count) + double reltuples, bool estimated_count, LVRelStats *vacrelstats) { IndexVacuumInfo ivinfo; const char *msg; PGRUsage ru0; + LVRelStats olderrcbarg; pg_rusage_init(&ru0); @@ -2371,8 +2466,20 @@ lazy_cleanup_index(Relation indrel, ivinfo.num_heap_tuples = reltuples; ivinfo.strategy = vac_strategy; + /* Update error traceback information */ + olderrcbarg = *vacrelstats; + update_vacuum_error_info(vacrelstats, + VACUUM_ERRCB_PHASE_INDEX_CLEANUP, + InvalidBlockNumber, + RelationGetRelationName(indrel)); + *stats = index_vacuum_cleanup(&ivinfo, *stats); + /* Revert back to the old phase information for error traceback */ + update_vacuum_error_info(vacrelstats, + olderrcbarg.phase, + olderrcbarg.blkno, + olderrcbarg.indname); if (!(*stats)) return; @@ -2517,6 +2624,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) * were vacuuming. */ new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); + vacrelstats->blkno = new_rel_pages; if (new_rel_pages >= old_rel_pages) { @@ -3320,6 +3428,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) int nindexes; char *sharedquery; IndexBulkDeleteResult **stats; + LVRelStats vacrelstats; + ErrorContextCallback errcallback; lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED, false); @@ -3369,10 +3479,90 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) if (lvshared->maintenance_work_mem_worker > 0) maintenance_work_mem = lvshared->maintenance_work_mem_worker; + /* + * Initialize vacrelstats for use as error callback arg by parallel + * worker. + */ + vacrelstats.relnamespace = get_namespace_name(RelationGetNamespace(onerel)); + vacrelstats.relname = pstrdup(RelationGetRelationName(onerel)); + vacrelstats.indname = NULL; + vacrelstats.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */ + + /* Setup error traceback support for ereport() */ + errcallback.callback = vacuum_error_callback; + errcallback.arg = &vacrelstats; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* Process indexes to perform vacuum/cleanup */ - parallel_vacuum_index(indrels, stats, lvshared, dead_tuples, nindexes); + parallel_vacuum_index(indrels, stats, lvshared, dead_tuples, nindexes, + &vacrelstats); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; vac_close_indexes(nindexes, indrels, RowExclusiveLock); table_close(onerel, ShareUpdateExclusiveLock); pfree(stats); } + +/* + * Error context callback for errors occurring during vacuum. + */ +static void +vacuum_error_callback(void *arg) +{ + LVRelStats *errinfo = arg; + + switch (errinfo->phase) + { + case VACUUM_ERRCB_PHASE_SCAN_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + errcontext("while scanning block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_HEAP: + if (BlockNumberIsValid(errinfo->blkno)) + errcontext("while vacuuming block %u of relation \"%s.%s\"", + errinfo->blkno, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_VACUUM_INDEX: + errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_INDEX_CLEANUP: + errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"", + errinfo->indname, errinfo->relnamespace, errinfo->relname); + break; + + case VACUUM_ERRCB_PHASE_TRUNCATE: + if (BlockNumberIsValid(errinfo->blkno)) + errcontext("while truncating relation \"%s.%s\" to %u blocks", + errinfo->relnamespace, errinfo->relname, errinfo->blkno); + break; + + case VACUUM_ERRCB_PHASE_UNKNOWN: + default: + return; /* do nothing; the errinfo may not be + * initialized */ + } +} + +/* Update vacuum error callback for the current phase, block, and index. */ +static void +update_vacuum_error_info(LVRelStats *errinfo, int phase, BlockNumber blkno, + char *indname) +{ + errinfo->blkno = blkno; + errinfo->phase = phase; + + /* Free index name from any previous phase */ + if (errinfo->indname) + pfree(errinfo->indname); + + /* For index phases, save the name of the current index for the callback */ + errinfo->indname = indname ? pstrdup(indname) : NULL; +} diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ca2d9ec8fb..ccc34ee2ac 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2565,6 +2565,7 @@ UserMapping UserOpts VacAttrStats VacAttrStatsP +VacErrPhase VacOptTernaryValue VacuumParams VacuumRelation