autovacuum: handle analyze for partitioned tables

Previously, autovacuum would completely ignore partitioned tables.  That
is a problem for analyze: if those tables are never analyzed, poor plans
may be chosen.  Make autovacuum aware of those tables by propagating
"changes since analyze" counts from the leaf partitions up the
partitioning hierarchy.

This also introduces necessary reloptions support for partitioned tables
(autovacuum_enabled, autovacuum_analyze_scale_factor,
autovacuum_analyze_threshold).  It's unclear how best to document this
aspect.

Author: Yuzuko Hosoya <yuzukohosoya@gmail.com>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com>
Reviewed-by: Álvaro Herrera <alvherre@alvh.no-ip.org>
Discussion: https://postgr.es/m/CAKkQ508_PwVgwJyBY=0Lmkz90j8CmWNPUxgHvCUwGhMrouz6UA@mail.gmail.com
This commit is contained in:
Alvaro Herrera 2021-04-08 01:19:36 -04:00
parent b3ee4c5038
commit 0827e8af70
No known key found for this signature in database
GPG Key ID: 1C20ACB9D5C564AE
7 changed files with 257 additions and 44 deletions

View File

@ -108,7 +108,7 @@ static relopt_bool boolRelOpts[] =
{
"autovacuum_enabled",
"Enables autovacuum in this relation",
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST,
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST | RELOPT_KIND_PARTITIONED,
ShareUpdateExclusiveLock
},
true
@ -246,7 +246,7 @@ static relopt_int intRelOpts[] =
{
"autovacuum_analyze_threshold",
"Minimum number of tuple inserts, updates or deletes prior to analyze",
RELOPT_KIND_HEAP,
RELOPT_KIND_HEAP | RELOPT_KIND_PARTITIONED,
ShareUpdateExclusiveLock
},
-1, 0, INT_MAX
@ -420,7 +420,7 @@ static relopt_real realRelOpts[] =
{
"autovacuum_analyze_scale_factor",
"Number of tuple inserts, updates or deletes prior to analyze as a fraction of reltuples",
RELOPT_KIND_HEAP,
RELOPT_KIND_HEAP | RELOPT_KIND_PARTITIONED,
ShareUpdateExclusiveLock
},
-1, 0.0, 100.0
@ -1962,12 +1962,11 @@ bytea *
partitioned_table_reloptions(Datum reloptions, bool validate)
{
/*
* There are no options for partitioned tables yet, but this is able to do
* some validation.
* autovacuum_enabled, autovacuum_analyze_threshold and
* autovacuum_analyze_scale_factor are supported for partitioned tables.
*/
return (bytea *) build_reloptions(reloptions, validate,
RELOPT_KIND_PARTITIONED,
0, NULL, 0);
return default_reloptions(reloptions, validate, RELOPT_KIND_PARTITIONED);
}
/*

View File

@ -660,7 +660,7 @@ CREATE VIEW pg_stat_all_tables AS
FROM pg_class C LEFT JOIN
pg_index I ON C.oid = I.indrelid
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE C.relkind IN ('r', 't', 'm')
WHERE C.relkind IN ('r', 't', 'm', 'p')
GROUP BY C.oid, N.nspname, C.relname;
CREATE VIEW pg_stat_xact_all_tables AS
@ -680,7 +680,7 @@ CREATE VIEW pg_stat_xact_all_tables AS
FROM pg_class C LEFT JOIN
pg_index I ON C.oid = I.indrelid
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE C.relkind IN ('r', 't', 'm')
WHERE C.relkind IN ('r', 't', 'm', 'p')
GROUP BY C.oid, N.nspname, C.relname;
CREATE VIEW pg_stat_sys_tables AS

View File

@ -612,8 +612,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
PROGRESS_ANALYZE_PHASE_FINALIZE_ANALYZE);
/*
* Update pages/tuples stats in pg_class, and report ANALYZE to the stats
* collector ... but not if we're doing inherited stats.
* Update pages/tuples stats in pg_class ... but not if we're doing
* inherited stats.
*
* We assume that VACUUM hasn't set pg_class.reltuples already, even
* during a VACUUM ANALYZE. Although VACUUM often updates pg_class,
@ -655,19 +655,35 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
InvalidMultiXactId,
in_outer_xact);
}
}
/*
* Now report ANALYZE to the stats collector.
*
* We deliberately don't report to the stats collector when doing
* inherited stats, because the stats collector only tracks per-table
* stats.
*
* Reset the changes_since_analyze counter only if we analyzed all
* columns; otherwise, there is still work for auto-analyze to do.
*/
/*
* Now report ANALYZE to the stats collector. For regular tables, we do
* it only if not doing inherited stats. For partitioned tables, we only
* do it for inherited stats. (We're never called for not-inherited stats
* on partitioned tables anyway.)
*
* Reset the changes_since_analyze counter only if we analyzed all
* columns; otherwise, there is still work for auto-analyze to do.
*/
if (!inh || onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
pgstat_report_analyze(onerel, totalrows, totaldeadrows,
(va_cols == NIL));
/*
* If this is a manual analyze of all columns of a permanent leaf
* partition, and not doing inherited stats, also let the collector know
* about the ancestor tables of this partition. Autovacuum does the
* equivalent of this at the start of its run, so there's no reason to do
* it there.
*/
if (!inh && !IsAutoVacuumWorkerProcess() &&
(va_cols == NIL) &&
onerel->rd_rel->relispartition &&
onerel->rd_rel->relkind == RELKIND_RELATION &&
onerel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
{
pgstat_report_anl_ancestors(RelationGetRelid(onerel));
}
/*

View File

@ -75,6 +75,7 @@
#include "catalog/dependency.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_inherits.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "lib/ilist.h"
@ -1969,6 +1970,7 @@ do_autovacuum(void)
int effective_multixact_freeze_max_age;
bool did_vacuum = false;
bool found_concurrent_worker = false;
bool updated = false;
int i;
/*
@ -2054,12 +2056,19 @@ do_autovacuum(void)
/*
* Scan pg_class to determine which tables to vacuum.
*
* We do this in two passes: on the first one we collect the list of plain
* relations and materialized views, and on the second one we collect
* TOAST tables. The reason for doing the second pass is that during it we
* want to use the main relation's pg_class.reloptions entry if the TOAST
* table does not have any, and we cannot obtain it unless we know
* beforehand what's the main table OID.
* We do this in three passes: First we let the stats collector know about
* the partitioned-table ancestors of all partitions that have accumulated
* changes not yet reported to their ancestors. This gives the second pass
* the total number of changed tuples in each partitioning hierarchy.
*
* On the second pass, we collect the list of plain relations,
* materialized views and partitioned tables. On the third one we collect
* TOAST tables.
*
* The reason for doing the third pass is that during it we want to use
* the main relation's pg_class.reloptions entry if the TOAST table does
* not have any, and we cannot obtain it unless we know beforehand what's
* the main table OID.
*
* We need to check TOAST tables separately because in cases with short,
* wide tables there might be proportionally much more activity in the
@ -2068,7 +2077,44 @@ do_autovacuum(void)
relScan = table_beginscan_catalog(classRel, 0, NULL);
/*
* On the first pass, we collect main tables to vacuum, and also the main
* First pass: before collecting the list of tables to vacuum, let stat
* collector know about partitioned-table ancestors of each partition.
*/
while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
{
Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
Oid relid = classForm->oid;
PgStat_StatTabEntry *tabentry;
/* Only consider permanent leaf partitions */
if (!classForm->relispartition ||
classForm->relkind == RELKIND_PARTITIONED_TABLE ||
classForm->relpersistence == RELPERSISTENCE_TEMP)
continue;
/*
* No need to do this for partitions that haven't acquired any rows.
*/
tabentry = pgstat_fetch_stat_tabentry(relid);
if (tabentry &&
tabentry->changes_since_analyze -
tabentry->changes_since_analyze_reported > 0)
{
pgstat_report_anl_ancestors(relid);
updated = true;
}
}
/* Acquire fresh stats for the next passes, if needed */
if (updated)
{
autovac_refresh_stats();
dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
shared = pgstat_fetch_stat_dbentry(InvalidOid);
}
/*
* On the second pass, we collect main tables to vacuum, and also the main
* table relid to TOAST relid mapping.
*/
while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
@ -2082,7 +2128,8 @@ do_autovacuum(void)
bool wraparound;
if (classForm->relkind != RELKIND_RELATION &&
classForm->relkind != RELKIND_MATVIEW)
classForm->relkind != RELKIND_MATVIEW &&
classForm->relkind != RELKIND_PARTITIONED_TABLE)
continue;
relid = classForm->oid;
@ -2157,7 +2204,7 @@ do_autovacuum(void)
table_endscan(relScan);
/* second pass: check TOAST tables */
/* third pass: check TOAST tables */
ScanKeyInit(&key,
Anum_pg_class_relkind,
BTEqualStrategyNumber, F_CHAREQ,
@ -2745,6 +2792,7 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_PARTITIONED_TABLE ||
((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
relopts = extractRelOptions(tup, pg_class_desc, NULL);
@ -3161,7 +3209,44 @@ relation_needs_vacanalyze(Oid relid,
*/
if (PointerIsValid(tabentry) && AutoVacuumingActive())
{
reltuples = classForm->reltuples;
if (classForm->relkind != RELKIND_PARTITIONED_TABLE)
{
reltuples = classForm->reltuples;
}
else
{
/*
* If the relation is a partitioned table, we must add up
* children's reltuples.
*/
List *children;
ListCell *lc;
reltuples = 0;
/* Find all members of inheritance set taking AccessShareLock */
children = find_all_inheritors(relid, AccessShareLock, NULL);
foreach(lc, children)
{
Oid childOID = lfirst_oid(lc);
HeapTuple childtuple;
Form_pg_class childclass;
childtuple = SearchSysCache1(RELOID, ObjectIdGetDatum(childOID));
childclass = (Form_pg_class) GETSTRUCT(childtuple);
/* Skip a partitioned table and foreign partitions */
if (RELKIND_HAS_STORAGE(childclass->relkind))
{
/* Sum up the child's reltuples for its parent table */
reltuples += childclass->reltuples;
}
ReleaseSysCache(childtuple);
}
list_free(children);
}
vactuples = tabentry->n_dead_tuples;
instuples = tabentry->inserts_since_vacuum;
anltuples = tabentry->changes_since_analyze;

View File

@ -38,6 +38,7 @@
#include "access/transam.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/partition.h"
#include "catalog/pg_database.h"
#include "catalog/pg_proc.h"
#include "common/ip.h"
@ -343,6 +344,7 @@ static void pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg
static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
static void pgstat_recv_anl_ancestors(PgStat_MsgAnlAncestors *msg, int len);
static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
static void pgstat_recv_wal(PgStat_MsgWal *msg, int len);
@ -1592,6 +1594,9 @@ pgstat_report_vacuum(Oid tableoid, bool shared,
*
* Caller must provide new live- and dead-tuples estimates, as well as a
* flag indicating whether to reset the changes_since_analyze counter.
* As an exception, partitioned tables are reported too, but only for the
* sake of changes_since_analyze, since they don't hold any data themselves.
* This counter tells us whether a partitioned table needs autoanalyze or not.
* --------
*/
void
@ -1613,21 +1618,31 @@ pgstat_report_analyze(Relation rel,
* be double-counted after commit. (This approach also ensures that the
* collector ends up with the right numbers if we abort instead of
* committing.)
*
* For partitioned tables, we don't report live and dead tuples, because
* such tables don't have any data.
*/
if (rel->pgstat_info != NULL)
{
PgStat_TableXactStatus *trans;
for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
/* If this rel is partitioned, skip modifying */
livetuples = deadtuples = 0;
else
{
livetuples -= trans->tuples_inserted - trans->tuples_deleted;
deadtuples -= trans->tuples_updated + trans->tuples_deleted;
for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
{
livetuples -= trans->tuples_inserted - trans->tuples_deleted;
deadtuples -= trans->tuples_updated + trans->tuples_deleted;
}
/* count stuff inserted by already-aborted subxacts, too */
deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
/* Since ANALYZE's counts are estimates, we could have underflowed */
livetuples = Max(livetuples, 0);
deadtuples = Max(deadtuples, 0);
}
/* count stuff inserted by already-aborted subxacts, too */
deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
/* Since ANALYZE's counts are estimates, we could have underflowed */
livetuples = Max(livetuples, 0);
deadtuples = Max(deadtuples, 0);
}
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE);
@ -1639,6 +1654,48 @@ pgstat_report_analyze(Relation rel,
msg.m_live_tuples = livetuples;
msg.m_dead_tuples = deadtuples;
pgstat_send(&msg, sizeof(msg));
}
/*
 * pgstat_report_anl_ancestors
 *
 *	Tell the collector which partitioned tables are ancestors of the
 *	partition "relid".  The collector uses that list to propagate the
 *	partition's analyze tuple counts up to each ancestor, which is what
 *	lets other processes decide whether a partitioned table needs to be
 *	analyzed.
 *
 *	The ancestor OIDs are packed into a PgStat_MsgAnlAncestors message; a
 *	message is sent each time the array fills up, and once more at the end
 *	for any remaining entries.  Nothing is sent if there are no ancestors.
 *
 *	NOTE(review): if the list ever needs more than one message, the
 *	receiver (pgstat_recv_anl_ancestors) marks the partition's changes as
 *	reported after handling the first one, so ancestors carried by later
 *	messages would appear to receive a zero delta -- confirm whether that
 *	case can arise in practice.
 */
void
pgstat_report_anl_ancestors(Oid relid)
{
	/* size of the fixed part of the message, preceding the OID array */
	const size_t hdrlen = offsetof(PgStat_MsgAnlAncestors, m_ancestors[0]);
	List	   *parents = get_partition_ancestors(relid);
	PgStat_MsgAnlAncestors msg;
	ListCell   *cell;

	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANL_ANCESTORS);
	msg.m_databaseid = MyDatabaseId;
	msg.m_tableoid = relid;
	msg.m_nancestors = 0;

	foreach(cell, parents)
	{
		msg.m_ancestors[msg.m_nancestors] = lfirst_oid(cell);
		msg.m_nancestors++;

		/* keep accumulating until the array is full */
		if (msg.m_nancestors < PGSTAT_NUM_ANCESTORENTRIES)
			continue;

		/* array is full: ship this batch and start a new one */
		pgstat_send(&msg, hdrlen + msg.m_nancestors * sizeof(Oid));
		msg.m_nancestors = 0;
	}

	/* send the final, partially filled batch, if any */
	if (msg.m_nancestors > 0)
		pgstat_send(&msg, hdrlen + msg.m_nancestors * sizeof(Oid));

	list_free(parents);
}
/* --------
@ -1958,7 +2015,8 @@ pgstat_initstats(Relation rel)
char relkind = rel->rd_rel->relkind;
/* We only count stats for things that have storage */
if (!RELKIND_HAS_STORAGE(relkind))
if (!RELKIND_HAS_STORAGE(relkind) &&
relkind != RELKIND_PARTITIONED_TABLE)
{
rel->pgstat_info = NULL;
return;
@ -3287,6 +3345,10 @@ PgstatCollectorMain(int argc, char *argv[])
pgstat_recv_analyze(&msg.msg_analyze, len);
break;
case PGSTAT_MTYPE_ANL_ANCESTORS:
pgstat_recv_anl_ancestors(&msg.msg_anl_ancestors, len);
break;
case PGSTAT_MTYPE_ARCHIVER:
pgstat_recv_archiver(&msg.msg_archiver, len);
break;
@ -3501,6 +3563,7 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
result->n_live_tuples = 0;
result->n_dead_tuples = 0;
result->changes_since_analyze = 0;
result->changes_since_analyze_reported = 0;
result->inserts_since_vacuum = 0;
result->blocks_fetched = 0;
result->blocks_hit = 0;
@ -4768,6 +4831,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples;
tabentry->changes_since_analyze_reported = 0;
tabentry->inserts_since_vacuum = tabmsg->t_counts.t_tuples_inserted;
tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched;
tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit;
@ -5159,7 +5223,10 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
* have no good way to estimate how many of those there were.
*/
if (msg->m_resetcounter)
{
tabentry->changes_since_analyze = 0;
tabentry->changes_since_analyze_reported = 0;
}
if (msg->m_autovacuum)
{
@ -5173,6 +5240,29 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len)
}
}
/* ----------
 * pgstat_recv_anl_ancestors() -
 *
 *	Process an ANL_ANCESTORS message: add the changes that the partition
 *	msg->m_tableoid has accumulated since it was last reported to the
 *	changes_since_analyze counter of every ancestor listed in the message,
 *	then remember that those changes have been reported.
 *
 *	Database and table entries are created if they don't exist yet.
 * ----------
 */
static void
pgstat_recv_anl_ancestors(PgStat_MsgAnlAncestors *msg, int len)
{
	PgStat_StatDBEntry *dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
	PgStat_StatTabEntry *partentry = pgstat_get_tab_entry(dbentry,
														  msg->m_tableoid,
														  true);
	int			idx = 0;

	while (idx < msg->m_nancestors)
	{
		PgStat_StatTabEntry *parent;
		PgStat_Counter unreported;

		parent = pgstat_get_tab_entry(dbentry, msg->m_ancestors[idx], true);

		/* changes on the partition that its ancestors haven't seen yet */
		unreported = partentry->changes_since_analyze -
			partentry->changes_since_analyze_reported;
		parent->changes_since_analyze += unreported;

		idx++;
	}

	/* everything counted so far has now been passed on */
	partentry->changes_since_analyze_reported = partentry->changes_since_analyze;
}
/* ----------
* pgstat_recv_archiver() -

View File

@ -69,6 +69,7 @@ typedef enum StatMsgType
PGSTAT_MTYPE_AUTOVAC_START,
PGSTAT_MTYPE_VACUUM,
PGSTAT_MTYPE_ANALYZE,
PGSTAT_MTYPE_ANL_ANCESTORS,
PGSTAT_MTYPE_ARCHIVER,
PGSTAT_MTYPE_BGWRITER,
PGSTAT_MTYPE_WAL,
@ -106,7 +107,7 @@ typedef int64 PgStat_Counter;
*
* tuples_inserted/updated/deleted/hot_updated count attempted actions,
* regardless of whether the transaction committed. delta_live_tuples,
* delta_dead_tuples, and changed_tuples are set depending on commit or abort.
* delta_dead_tuples, changed_tuples are set depending on commit or abort.
* Note that delta_live_tuples and delta_dead_tuples can be negative!
* ----------
*/
@ -429,6 +430,25 @@ typedef struct PgStat_MsgAnalyze
PgStat_Counter m_dead_tuples;
} PgStat_MsgAnalyze;
/* ----------
 * PgStat_MsgAnlAncestors		Sent by a backend or an autovacuum worker
 *								to tell the collector which partitioned
 *								tables are ancestors of a given partition,
 *								so that the partition's analyze counters
 *								can be propagated to them.
 *
 * PGSTAT_NUM_ANCESTORENTRIES is the number of ancestor OIDs that fit in one
 * message after the fixed fields (two Oids and one int) are accounted for.
 * A sender with more ancestors than that has to use several messages.
 * ----------
 */
#define PGSTAT_NUM_ANCESTORENTRIES \
((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(Oid) - sizeof(int)) \
/ sizeof(Oid))
typedef struct PgStat_MsgAnlAncestors
{
/* common message header */
PgStat_MsgHdr m_hdr;
/* database the partition belongs to */
Oid m_databaseid;
/* OID of the partition whose counts are to be propagated */
Oid m_tableoid;
/* number of valid entries in m_ancestors[] */
int m_nancestors;
/* OIDs of the partition's ancestors; only the first m_nancestors are used */
Oid m_ancestors[PGSTAT_NUM_ANCESTORENTRIES];
} PgStat_MsgAnlAncestors;
/* ----------
* PgStat_MsgArchiver Sent by the archiver to update statistics.
@ -674,6 +694,7 @@ typedef union PgStat_Msg
PgStat_MsgAutovacStart msg_autovacuum_start;
PgStat_MsgVacuum msg_vacuum;
PgStat_MsgAnalyze msg_analyze;
PgStat_MsgAnlAncestors msg_anl_ancestors;
PgStat_MsgArchiver msg_archiver;
PgStat_MsgBgWriter msg_bgwriter;
PgStat_MsgWal msg_wal;
@ -769,6 +790,7 @@ typedef struct PgStat_StatTabEntry
PgStat_Counter n_live_tuples;
PgStat_Counter n_dead_tuples;
PgStat_Counter changes_since_analyze;
PgStat_Counter changes_since_analyze_reported;
PgStat_Counter inserts_since_vacuum;
PgStat_Counter blocks_fetched;
@ -975,6 +997,7 @@ extern void pgstat_report_vacuum(Oid tableoid, bool shared,
extern void pgstat_report_analyze(Relation rel,
PgStat_Counter livetuples, PgStat_Counter deadtuples,
bool resetcounter);
extern void pgstat_report_anl_ancestors(Oid relid);
extern void pgstat_report_recovery_conflict(int reason);
extern void pgstat_report_deadlock(void);

View File

@ -1807,7 +1807,7 @@ pg_stat_all_tables| SELECT c.oid AS relid,
FROM ((pg_class c
LEFT JOIN pg_index i ON ((c.oid = i.indrelid)))
LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char"]))
WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char", 'p'::"char"]))
GROUP BY c.oid, n.nspname, c.relname;
pg_stat_archiver| SELECT s.archived_count,
s.last_archived_wal,
@ -2210,7 +2210,7 @@ pg_stat_xact_all_tables| SELECT c.oid AS relid,
FROM ((pg_class c
LEFT JOIN pg_index i ON ((c.oid = i.indrelid)))
LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char"]))
WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char", 'p'::"char"]))
GROUP BY c.oid, n.nspname, c.relname;
pg_stat_xact_sys_tables| SELECT pg_stat_xact_all_tables.relid,
pg_stat_xact_all_tables.schemaname,