Introduce vacuum errcontext to display additional information.

The additional information displayed will be block number for error
occurring while processing heap and index name for error occurring
while processing the index.

This will help us in diagnosing the problems that occur during a vacuum.
For ex. due to corruption (either caused by bad hardware or by some bug)
if we get some error while vacuuming, it can help us identify the block
in heap and or additional index information.

It sets up an error context callback to display additional information
with the error.  During different phases of vacuum (heap scan, heap
vacuum, index vacuum, index clean up, heap truncate), we update the error
context callback to display appropriate information.  We can extend it to
a bit more granular level like adding the phases for FSM operations or for
prefetching the blocks while truncating. However, I felt that it requires
adding many more error callback function calls and can make the code a bit
complex, so left those for now.

Author: Justin Pryzby, with few changes by Amit Kapila
Reviewed-by: Alvaro Herrera, Amit Kapila, Andres Freund, Michael Paquier
and Sawada Masahiko
Discussion: https://www.postgresql.org/message-id/20191120210600.GC30362@telsasoft.com
This commit is contained in:
Amit Kapila 2020-03-30 07:33:38 +05:30
parent 2743d9ae4a
commit b61d161c14
2 changed files with 216 additions and 25 deletions

View File

@ -144,6 +144,17 @@
*/
#define ParallelVacuumIsActive(lps) PointerIsValid(lps)
/* Phases of vacuum during which we report error context. */
typedef enum
{
VACUUM_ERRCB_PHASE_UNKNOWN,
VACUUM_ERRCB_PHASE_SCAN_HEAP,
VACUUM_ERRCB_PHASE_VACUUM_INDEX,
VACUUM_ERRCB_PHASE_VACUUM_HEAP,
VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
VACUUM_ERRCB_PHASE_TRUNCATE
} VacErrPhase;
/*
* LVDeadTuples stores the dead tuple TIDs collected during the heap scan.
* This is allocated in the DSM segment in parallel mode and in local memory
@ -270,6 +281,8 @@ typedef struct LVParallelState
typedef struct LVRelStats
{
char *relnamespace;
char *relname;
/* useindex = true means two-pass strategy; false means one-pass */
bool useindex;
/* Overall statistics about rel */
@ -290,8 +303,12 @@ typedef struct LVRelStats
int num_index_scans;
TransactionId latestRemovedXid;
bool lock_waiter_detected;
} LVRelStats;
/* Used for error callback */
char *indname;
BlockNumber blkno; /* used only for heap operations */
VacErrPhase phase;
} LVRelStats;
/* A few variables that don't seem worth passing around as parameters */
static int elevel = -1;
@ -314,10 +331,10 @@ static void lazy_vacuum_all_indexes(Relation onerel, Relation *Irel,
LVRelStats *vacrelstats, LVParallelState *lps,
int nindexes);
static void lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
LVDeadTuples *dead_tuples, double reltuples);
LVDeadTuples *dead_tuples, double reltuples, LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
IndexBulkDeleteResult **stats,
double reltuples, bool estimated_count);
double reltuples, bool estimated_count, LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
static bool should_attempt_truncation(VacuumParams *params,
@ -337,13 +354,13 @@ static void lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult *
int nindexes);
static void parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats,
LVShared *lvshared, LVDeadTuples *dead_tuples,
int nindexes);
int nindexes, LVRelStats *vacrelstats);
static void vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats, LVParallelState *lps,
int nindexes);
static void vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats,
LVShared *lvshared, LVSharedIndStats *shared_indstats,
LVDeadTuples *dead_tuples);
LVDeadTuples *dead_tuples, LVRelStats *vacrelstats);
static void lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats, LVParallelState *lps,
int nindexes);
@ -361,6 +378,9 @@ static void end_parallel_vacuum(Relation *Irel, IndexBulkDeleteResult **stats,
LVParallelState *lps, int nindexes);
static LVSharedIndStats *get_indstats(LVShared *lvshared, int n);
static bool skip_parallel_vacuum_index(Relation indrel, LVShared *lvshared);
static void vacuum_error_callback(void *arg);
static void update_vacuum_error_info(LVRelStats *errinfo, int phase,
BlockNumber blkno, char *indname);
/*
@ -394,6 +414,7 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
double new_live_tuples;
TransactionId new_frozen_xid;
MultiXactId new_min_multi;
ErrorContextCallback errcallback;
Assert(params != NULL);
Assert(params->index_cleanup != VACOPT_TERNARY_DEFAULT);
@ -460,6 +481,10 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
vacrelstats->relnamespace = get_namespace_name(RelationGetNamespace(onerel));
vacrelstats->relname = pstrdup(RelationGetRelationName(onerel));
vacrelstats->indname = NULL;
vacrelstats->phase = VACUUM_ERRCB_PHASE_UNKNOWN;
vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
vacrelstats->old_live_tuples = onerel->rd_rel->reltuples;
vacrelstats->num_index_scans = 0;
@ -471,6 +496,22 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
vacrelstats->useindex = (nindexes > 0 &&
params->index_cleanup == VACOPT_TERNARY_ENABLED);
/*
* Setup error traceback support for ereport(). The idea is to set up an
* error context callback to display additional information on any error
* during a vacuum. During different phases of vacuum (heap scan, heap
* vacuum, index vacuum, index clean up, heap truncate), we update the
* error context callback to display appropriate information.
*
* Note that the index vacuum and heap vacuum phases may be called
* multiple times in the middle of the heap scan phase. So the old phase
* information is restored at the end of those phases.
*/
errcallback.callback = vacuum_error_callback;
errcallback.arg = vacrelstats;
errcallback.previous = error_context_stack;
error_context_stack = &errcallback;
/* Do the vacuuming */
lazy_scan_heap(onerel, params, vacrelstats, Irel, nindexes, aggressive);
@ -497,7 +538,19 @@ heap_vacuum_rel(Relation onerel, VacuumParams *params,
* Optionally truncate the relation.
*/
if (should_attempt_truncation(params, vacrelstats))
{
/*
* Update error traceback information. This is the last phase during
* which we add context information to errors, so we don't need to
* revert to the previous phase.
*/
update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_TRUNCATE,
vacrelstats->nonempty_pages, NULL);
lazy_truncate_heap(onerel, vacrelstats);
}
/* Pop the error context stack */
error_context_stack = errcallback.previous;
/* Report that we are now doing final cleanup */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
@ -699,7 +752,6 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
BlockNumber nblocks,
blkno;
HeapTupleData tuple;
char *relname;
TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid;
TransactionId relminmxid = onerel->rd_rel->relminmxid;
BlockNumber empty_pages,
@ -727,17 +779,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
pg_rusage_init(&ru0);
relname = RelationGetRelationName(onerel);
if (aggressive)
ereport(elevel,
(errmsg("aggressively vacuuming \"%s.%s\"",
get_namespace_name(RelationGetNamespace(onerel)),
relname)));
vacrelstats->relnamespace,
vacrelstats->relname)));
else
ereport(elevel,
(errmsg("vacuuming \"%s.%s\"",
get_namespace_name(RelationGetNamespace(onerel)),
relname)));
vacrelstats->relnamespace,
vacrelstats->relname)));
empty_pages = vacuumed_pages = 0;
next_fsm_block_to_vacuum = (BlockNumber) 0;
@ -893,6 +944,9 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_SCAN_HEAP,
blkno, NULL);
if (blkno == next_unskippable_block)
{
/* Time to advance next_unskippable_block */
@ -1534,7 +1588,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
&& VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
{
elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
relname, blkno);
vacrelstats->relname, blkno);
visibilitymap_clear(onerel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
}
@ -1555,7 +1609,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats,
else if (PageIsAllVisible(page) && has_dead_tuples)
{
elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
relname, blkno);
vacrelstats->relname, blkno);
PageClearAllVisible(page);
MarkBufferDirty(buf);
visibilitymap_clear(onerel, blkno, vmbuffer,
@ -1744,7 +1798,7 @@ lazy_vacuum_all_indexes(Relation onerel, Relation *Irel,
for (idx = 0; idx < nindexes; idx++)
lazy_vacuum_index(Irel[idx], &stats[idx], vacrelstats->dead_tuples,
vacrelstats->old_live_tuples);
vacrelstats->old_live_tuples, vacrelstats);
}
/* Increase and report the number of index scans */
@ -1772,11 +1826,17 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
int npages;
PGRUsage ru0;
Buffer vmbuffer = InvalidBuffer;
LVRelStats olderrinfo;
/* Report that we are now vacuuming the heap */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
/* Update error traceback information */
olderrinfo = *vacrelstats;
update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
InvalidBlockNumber, NULL);
pg_rusage_init(&ru0);
npages = 0;
@ -1791,6 +1851,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
vacuum_delay_point();
tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples->itemptrs[tupindex]);
vacrelstats->blkno = tblk;
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
vac_strategy);
if (!ConditionalLockBufferForCleanup(buf))
@ -1822,6 +1883,12 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
RelationGetRelationName(onerel),
tupindex, npages),
errdetail_internal("%s", pg_rusage_show(&ru0))));
/* Revert to the previous phase information for error traceback */
update_vacuum_error_info(vacrelstats,
olderrinfo.phase,
olderrinfo.blkno,
olderrinfo.indname);
}
/*
@ -1844,9 +1911,15 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
int uncnt = 0;
TransactionId visibility_cutoff_xid;
bool all_frozen;
LVRelStats olderrinfo;
pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
/* Update error traceback information */
olderrinfo = *vacrelstats;
update_vacuum_error_info(vacrelstats, VACUUM_ERRCB_PHASE_VACUUM_HEAP,
blkno, NULL);
START_CRIT_SECTION();
for (; tupindex < dead_tuples->num_tuples; tupindex++)
@ -1923,6 +1996,11 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
*vmbuffer, visibility_cutoff_xid, flags);
}
/* Revert to the previous phase information for error traceback */
update_vacuum_error_info(vacrelstats,
olderrinfo.phase,
olderrinfo.blkno,
olderrinfo.indname);
return tupindex;
}
@ -2083,7 +2161,7 @@ lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
* indexes in the case where no workers are launched.
*/
parallel_vacuum_index(Irel, stats, lps->lvshared,
vacrelstats->dead_tuples, nindexes);
vacrelstats->dead_tuples, nindexes, vacrelstats);
/* Wait for all vacuum workers to finish */
WaitForParallelWorkersToFinish(lps->pcxt);
@ -2106,7 +2184,7 @@ lazy_parallel_vacuum_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
static void
parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats,
LVShared *lvshared, LVDeadTuples *dead_tuples,
int nindexes)
int nindexes, LVRelStats *vacrelstats)
{
/*
* Increment the active worker count if we are able to launch any worker.
@ -2140,7 +2218,7 @@ parallel_vacuum_index(Relation *Irel, IndexBulkDeleteResult **stats,
/* Do vacuum or cleanup of the index */
vacuum_one_index(Irel[idx], &(stats[idx]), lvshared, shared_indstats,
dead_tuples);
dead_tuples, vacrelstats);
}
/*
@ -2180,7 +2258,8 @@ vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats,
if (shared_indstats == NULL ||
skip_parallel_vacuum_index(Irel[i], lps->lvshared))
vacuum_one_index(Irel[i], &(stats[i]), lps->lvshared,
shared_indstats, vacrelstats->dead_tuples);
shared_indstats, vacrelstats->dead_tuples,
vacrelstats);
}
/*
@ -2200,7 +2279,7 @@ vacuum_indexes_leader(Relation *Irel, IndexBulkDeleteResult **stats,
static void
vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats,
LVShared *lvshared, LVSharedIndStats *shared_indstats,
LVDeadTuples *dead_tuples)
LVDeadTuples *dead_tuples, LVRelStats *vacrelstats)
{
IndexBulkDeleteResult *bulkdelete_res = NULL;
@ -2220,10 +2299,10 @@ vacuum_one_index(Relation indrel, IndexBulkDeleteResult **stats,
/* Do vacuum or cleanup of the index */
if (lvshared->for_cleanup)
lazy_cleanup_index(indrel, stats, lvshared->reltuples,
lvshared->estimated_count);
lvshared->estimated_count, vacrelstats);
else
lazy_vacuum_index(indrel, stats, dead_tuples,
lvshared->reltuples);
lvshared->reltuples, vacrelstats);
/*
* Copy the index bulk-deletion result returned from ambulkdelete and
@ -2298,7 +2377,8 @@ lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
for (idx = 0; idx < nindexes; idx++)
lazy_cleanup_index(Irel[idx], &stats[idx],
vacrelstats->new_rel_tuples,
vacrelstats->tupcount_pages < vacrelstats->rel_pages);
vacrelstats->tupcount_pages < vacrelstats->rel_pages,
vacrelstats);
}
}
@ -2313,11 +2393,12 @@ lazy_cleanup_all_indexes(Relation *Irel, IndexBulkDeleteResult **stats,
*/
static void
lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
LVDeadTuples *dead_tuples, double reltuples)
LVDeadTuples *dead_tuples, double reltuples, LVRelStats *vacrelstats)
{
IndexVacuumInfo ivinfo;
const char *msg;
PGRUsage ru0;
LVRelStats olderrinfo;
pg_rusage_init(&ru0);
@ -2329,6 +2410,13 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
ivinfo.num_heap_tuples = reltuples;
ivinfo.strategy = vac_strategy;
/* Update error traceback information */
olderrinfo = *vacrelstats;
update_vacuum_error_info(vacrelstats,
VACUUM_ERRCB_PHASE_VACUUM_INDEX,
InvalidBlockNumber,
RelationGetRelationName(indrel));
/* Do bulk deletion */
*stats = index_bulk_delete(&ivinfo, *stats,
lazy_tid_reaped, (void *) dead_tuples);
@ -2343,6 +2431,12 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
RelationGetRelationName(indrel),
dead_tuples->num_tuples),
errdetail_internal("%s", pg_rusage_show(&ru0))));
/* Revert to the previous phase information for error traceback */
update_vacuum_error_info(vacrelstats,
olderrinfo.phase,
olderrinfo.blkno,
olderrinfo.indname);
}
/*
@ -2354,11 +2448,12 @@ lazy_vacuum_index(Relation indrel, IndexBulkDeleteResult **stats,
static void
lazy_cleanup_index(Relation indrel,
IndexBulkDeleteResult **stats,
double reltuples, bool estimated_count)
double reltuples, bool estimated_count, LVRelStats *vacrelstats)
{
IndexVacuumInfo ivinfo;
const char *msg;
PGRUsage ru0;
LVRelStats olderrcbarg;
pg_rusage_init(&ru0);
@ -2371,8 +2466,20 @@ lazy_cleanup_index(Relation indrel,
ivinfo.num_heap_tuples = reltuples;
ivinfo.strategy = vac_strategy;
/* Update error traceback information */
olderrcbarg = *vacrelstats;
update_vacuum_error_info(vacrelstats,
VACUUM_ERRCB_PHASE_INDEX_CLEANUP,
InvalidBlockNumber,
RelationGetRelationName(indrel));
*stats = index_vacuum_cleanup(&ivinfo, *stats);
/* Revert back to the old phase information for error traceback */
update_vacuum_error_info(vacrelstats,
olderrcbarg.phase,
olderrcbarg.blkno,
olderrcbarg.indname);
if (!(*stats))
return;
@ -2517,6 +2624,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
* were vacuuming.
*/
new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
vacrelstats->blkno = new_rel_pages;
if (new_rel_pages >= old_rel_pages)
{
@ -3320,6 +3428,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
int nindexes;
char *sharedquery;
IndexBulkDeleteResult **stats;
LVRelStats vacrelstats;
ErrorContextCallback errcallback;
lvshared = (LVShared *) shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_SHARED,
false);
@ -3369,10 +3479,90 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc)
if (lvshared->maintenance_work_mem_worker > 0)
maintenance_work_mem = lvshared->maintenance_work_mem_worker;
/*
* Initialize vacrelstats for use as error callback arg by parallel
* worker.
*/
vacrelstats.relnamespace = get_namespace_name(RelationGetNamespace(onerel));
vacrelstats.relname = pstrdup(RelationGetRelationName(onerel));
vacrelstats.indname = NULL;
vacrelstats.phase = VACUUM_ERRCB_PHASE_UNKNOWN; /* Not yet processing */
/* Setup error traceback support for ereport() */
errcallback.callback = vacuum_error_callback;
errcallback.arg = &vacrelstats;
errcallback.previous = error_context_stack;
error_context_stack = &errcallback;
/* Process indexes to perform vacuum/cleanup */
parallel_vacuum_index(indrels, stats, lvshared, dead_tuples, nindexes);
parallel_vacuum_index(indrels, stats, lvshared, dead_tuples, nindexes,
&vacrelstats);
/* Pop the error context stack */
error_context_stack = errcallback.previous;
vac_close_indexes(nindexes, indrels, RowExclusiveLock);
table_close(onerel, ShareUpdateExclusiveLock);
pfree(stats);
}
/*
* Error context callback for errors occurring during vacuum.
*/
static void
vacuum_error_callback(void *arg)
{
LVRelStats *errinfo = arg;
switch (errinfo->phase)
{
case VACUUM_ERRCB_PHASE_SCAN_HEAP:
if (BlockNumberIsValid(errinfo->blkno))
errcontext("while scanning block %u of relation \"%s.%s\"",
errinfo->blkno, errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_VACUUM_HEAP:
if (BlockNumberIsValid(errinfo->blkno))
errcontext("while vacuuming block %u of relation \"%s.%s\"",
errinfo->blkno, errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_VACUUM_INDEX:
errcontext("while vacuuming index \"%s\" of relation \"%s.%s\"",
errinfo->indname, errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_INDEX_CLEANUP:
errcontext("while cleaning up index \"%s\" of relation \"%s.%s\"",
errinfo->indname, errinfo->relnamespace, errinfo->relname);
break;
case VACUUM_ERRCB_PHASE_TRUNCATE:
if (BlockNumberIsValid(errinfo->blkno))
errcontext("while truncating relation \"%s.%s\" to %u blocks",
errinfo->relnamespace, errinfo->relname, errinfo->blkno);
break;
case VACUUM_ERRCB_PHASE_UNKNOWN:
default:
return; /* do nothing; the errinfo may not be
* initialized */
}
}
/* Update vacuum error callback for the current phase, block, and index. */
static void
update_vacuum_error_info(LVRelStats *errinfo, int phase, BlockNumber blkno,
char *indname)
{
errinfo->blkno = blkno;
errinfo->phase = phase;
/* Free index name from any previous phase */
if (errinfo->indname)
pfree(errinfo->indname);
/* For index phases, save the name of the current index for the callback */
errinfo->indname = indname ? pstrdup(indname) : NULL;
}

View File

@ -2565,6 +2565,7 @@ UserMapping
UserOpts
VacAttrStats
VacAttrStatsP
VacErrPhase
VacOptTernaryValue
VacuumParams
VacuumRelation