diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index d897bbec2b..5554275e64 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -108,7 +108,7 @@ static relopt_bool boolRelOpts[] = { "autovacuum_enabled", "Enables autovacuum in this relation", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST | RELOPT_KIND_PARTITIONED, ShareUpdateExclusiveLock }, true @@ -246,7 +246,7 @@ static relopt_int intRelOpts[] = { "autovacuum_analyze_threshold", "Minimum number of tuple inserts, updates or deletes prior to analyze", - RELOPT_KIND_HEAP, + RELOPT_KIND_HEAP | RELOPT_KIND_PARTITIONED, ShareUpdateExclusiveLock }, -1, 0, INT_MAX @@ -420,7 +420,7 @@ static relopt_real realRelOpts[] = { "autovacuum_analyze_scale_factor", "Number of tuple inserts, updates or deletes prior to analyze as a fraction of reltuples", - RELOPT_KIND_HEAP, + RELOPT_KIND_HEAP | RELOPT_KIND_PARTITIONED, ShareUpdateExclusiveLock }, -1, 0.0, 100.0 @@ -1962,12 +1962,11 @@ bytea * partitioned_table_reloptions(Datum reloptions, bool validate) { /* - * There are no options for partitioned tables yet, but this is able to do - * some validation. + * autovacuum_enabled, autovacuum_analyze_threshold and + * autovacuum_analyze_scale_factor are supported for partitioned tables. 
*/ - return (bytea *) build_reloptions(reloptions, validate, - RELOPT_KIND_PARTITIONED, - 0, NULL, 0); + + return default_reloptions(reloptions, validate, RELOPT_KIND_PARTITIONED); } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 4d6b232787..a47e102f36 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -660,7 +660,7 @@ CREATE VIEW pg_stat_all_tables AS FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm') + WHERE C.relkind IN ('r', 't', 'm', 'p') GROUP BY C.oid, N.nspname, C.relname; CREATE VIEW pg_stat_xact_all_tables AS @@ -680,7 +680,7 @@ CREATE VIEW pg_stat_xact_all_tables AS FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm') + WHERE C.relkind IN ('r', 't', 'm', 'p') GROUP BY C.oid, N.nspname, C.relname; CREATE VIEW pg_stat_sys_tables AS diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index f84616d3d2..5bdaceefd5 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -612,8 +612,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params, PROGRESS_ANALYZE_PHASE_FINALIZE_ANALYZE); /* - * Update pages/tuples stats in pg_class, and report ANALYZE to the stats - * collector ... but not if we're doing inherited stats. + * Update pages/tuples stats in pg_class ... but not if we're doing + * inherited stats. * * We assume that VACUUM hasn't set pg_class.reltuples already, even * during a VACUUM ANALYZE. Although VACUUM often updates pg_class, @@ -655,19 +655,35 @@ do_analyze_rel(Relation onerel, VacuumParams *params, InvalidMultiXactId, in_outer_xact); } + } - /* - * Now report ANALYZE to the stats collector. 
- * - * We deliberately don't report to the stats collector when doing - * inherited stats, because the stats collector only tracks per-table - * stats. - * - * Reset the changes_since_analyze counter only if we analyzed all - * columns; otherwise, there is still work for auto-analyze to do. - */ + /* + * Now report ANALYZE to the stats collector. For regular tables, we do + * it only if not doing inherited stats. For partitioned tables, we only + * do it for inherited stats. (We're never called for not-inherited stats + * on partitioned tables anyway.) + * + * Reset the changes_since_analyze counter only if we analyzed all + * columns; otherwise, there is still work for auto-analyze to do. + */ + if (!inh || onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) pgstat_report_analyze(onerel, totalrows, totaldeadrows, (va_cols == NIL)); + + /* + * If this is a manual analyze of all columns of a permanent leaf + * partition, and not doing inherited stats, also let the collector know + * about the ancestor tables of this partition. Autovacuum does the + * equivalent of this at the start of its run, so there's no reason to do + * it there. 
+ */ + if (!inh && !IsAutoVacuumWorkerProcess() && + (va_cols == NIL) && + onerel->rd_rel->relispartition && + onerel->rd_rel->relkind == RELKIND_RELATION && + onerel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + { + pgstat_report_anl_ancestors(RelationGetRelid(onerel)); } /* diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 23ef23c13e..aef9ac4dd2 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -75,6 +75,7 @@ #include "catalog/dependency.h" #include "catalog/namespace.h" #include "catalog/pg_database.h" +#include "catalog/pg_inherits.h" #include "commands/dbcommands.h" #include "commands/vacuum.h" #include "lib/ilist.h" @@ -1969,6 +1970,7 @@ do_autovacuum(void) int effective_multixact_freeze_max_age; bool did_vacuum = false; bool found_concurrent_worker = false; + bool updated = false; int i; /* @@ -2054,12 +2056,19 @@ do_autovacuum(void) /* * Scan pg_class to determine which tables to vacuum. * - * We do this in two passes: on the first one we collect the list of plain - * relations and materialized views, and on the second one we collect - * TOAST tables. The reason for doing the second pass is that during it we - * want to use the main relation's pg_class.reloptions entry if the TOAST - * table does not have any, and we cannot obtain it unless we know - * beforehand what's the main table OID. + * We do this in three passes: First we let the stats collector know + * about the partitioned table ancestors of all partitions that have + * recently acquired rows for analyze. This informs the second pass + * about the total tuple counts in partitioning hierarchies. + * + * On the second pass, we collect the list of plain relations, + * materialized views and partitioned tables. On the third one we collect + * TOAST tables. 
+ * + * The reason for doing the third pass is that during it we want to use + * the main relation's pg_class.reloptions entry if the TOAST table does + * not have any, and we cannot obtain it unless we know beforehand what's + * the main table OID. * * We need to check TOAST tables separately because in cases with short, * wide tables there might be proportionally much more activity in the @@ -2068,7 +2077,44 @@ do_autovacuum(void) relScan = table_beginscan_catalog(classRel, 0, NULL); /* - * On the first pass, we collect main tables to vacuum, and also the main + * First pass: before collecting the list of tables to vacuum, let stat + * collector know about partitioned-table ancestors of each partition. + */ + while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + Oid relid = classForm->oid; + PgStat_StatTabEntry *tabentry; + + /* Only consider non-temporary leaf partitions */ + if (!classForm->relispartition || + classForm->relkind == RELKIND_PARTITIONED_TABLE || + classForm->relpersistence == RELPERSISTENCE_TEMP) + continue; + + /* + * No need to do this for partitions that haven't acquired any rows. + */ + tabentry = pgstat_fetch_stat_tabentry(relid); + if (tabentry && + tabentry->changes_since_analyze - + tabentry->changes_since_analyze_reported > 0) + { + pgstat_report_anl_ancestors(relid); + updated = true; + } + } + + /* Acquire fresh stats for the next passes, if needed */ + if (updated) + { + autovac_refresh_stats(); + dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); + shared = pgstat_fetch_stat_dbentry(InvalidOid); + } + + /* + * On the second pass, we collect main tables to vacuum, and also the main * table relid to TOAST relid mapping. 
*/ while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) @@ -2082,7 +2128,8 @@ do_autovacuum(void) bool wraparound; if (classForm->relkind != RELKIND_RELATION && - classForm->relkind != RELKIND_MATVIEW) + classForm->relkind != RELKIND_MATVIEW && + classForm->relkind != RELKIND_PARTITIONED_TABLE) continue; relid = classForm->oid; @@ -2157,7 +2204,7 @@ do_autovacuum(void) table_endscan(relScan); - /* second pass: check TOAST tables */ + /* third pass: check TOAST tables */ ScanKeyInit(&key, Anum_pg_class_relkind, BTEqualStrategyNumber, F_CHAREQ, @@ -2745,6 +2792,7 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || + ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_PARTITIONED_TABLE || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); relopts = extractRelOptions(tup, pg_class_desc, NULL); @@ -3161,7 +3209,44 @@ relation_needs_vacanalyze(Oid relid, */ if (PointerIsValid(tabentry) && AutoVacuumingActive()) { - reltuples = classForm->reltuples; + if (classForm->relkind != RELKIND_PARTITIONED_TABLE) + { + reltuples = classForm->reltuples; + } + else + { + /* + * If the relation is a partitioned table, we must add up + * children's reltuples. 
+ */ + List *children; + ListCell *lc; + + reltuples = 0; + + /* Find all members of inheritance set taking AccessShareLock */ + children = find_all_inheritors(relid, AccessShareLock, NULL); + + foreach(lc, children) + { + Oid childOID = lfirst_oid(lc); + HeapTuple childtuple; + Form_pg_class childclass; + + childtuple = SearchSysCache1(RELOID, ObjectIdGetDatum(childOID)); + childclass = (Form_pg_class) GETSTRUCT(childtuple); + + /* Skip a partitioned table and foreign partitions */ + if (RELKIND_HAS_STORAGE(childclass->relkind)) + { + /* Sum up the child's reltuples for its parent table */ + reltuples += childclass->reltuples; + } + ReleaseSysCache(childtuple); + } + + list_free(children); + } vactuples = tabentry->n_dead_tuples; instuples = tabentry->inserts_since_vacuum; anltuples = tabentry->changes_since_analyze; diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5ba776e789..958183dd69 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -38,6 +38,7 @@ #include "access/transam.h" #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "catalog/partition.h" #include "catalog/pg_database.h" #include "catalog/pg_proc.h" #include "common/ip.h" @@ -343,6 +344,7 @@ static void pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len); static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len); static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len); +static void pgstat_recv_anl_ancestors(PgStat_MsgAnlAncestors *msg, int len); static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len); static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); static void pgstat_recv_wal(PgStat_MsgWal *msg, int len); @@ -1592,6 +1594,9 @@ pgstat_report_vacuum(Oid tableoid, bool shared, * * Caller must provide new live- and dead-tuples estimates, as well as a * flag indicating whether to reset the 
changes_since_analyze counter. + * As an exception, partitioned tables are supported too, but only for + * changes_since_analyze, even though they don't hold any data themselves. + * This counter tells us whether a partitioned table needs autoanalyze. * -------- */ void @@ -1613,21 +1618,31 @@ pgstat_report_analyze(Relation rel, * be double-counted after commit. (This approach also ensures that the * collector ends up with the right numbers if we abort instead of * committing.) + * + * For partitioned tables, we don't report live and dead tuples, because + * such tables don't have any data. */ if (rel->pgstat_info != NULL) { PgStat_TableXactStatus *trans; - for (trans = rel->pgstat_info->trans; trans; trans = trans->upper) + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* If this rel is partitioned, skip modifying */ + livetuples = deadtuples = 0; + else { - livetuples -= trans->tuples_inserted - trans->tuples_deleted; - deadtuples -= trans->tuples_updated + trans->tuples_deleted; + for (trans = rel->pgstat_info->trans; trans; trans = trans->upper) + { + livetuples -= trans->tuples_inserted - trans->tuples_deleted; + deadtuples -= trans->tuples_updated + trans->tuples_deleted; + } + /* count stuff inserted by already-aborted subxacts, too */ + deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples; + /* Since ANALYZE's counts are estimates, we could have underflowed */ + livetuples = Max(livetuples, 0); + deadtuples = Max(deadtuples, 0); } - /* count stuff inserted by already-aborted subxacts, too */ - deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples; - /* Since ANALYZE's counts are estimates, we could have underflowed */ - livetuples = Max(livetuples, 0); - deadtuples = Max(deadtuples, 0); + } pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); @@ -1639,6 +1654,48 @@ pgstat_report_analyze(Relation rel, msg.m_live_tuples = livetuples; msg.m_dead_tuples = deadtuples; pgstat_send(&msg, sizeof(msg)); + +} + +/* + * pgstat_report_anl_ancestors + * + * Send list of 
partitioned table ancestors of the given partition to the + * collector. The collector is in charge of propagating the analyze tuple + * counts from the partition to its ancestors. This is necessary so that + * other processes can decide whether to analyze the partitioned tables. + */ +void +pgstat_report_anl_ancestors(Oid relid) +{ + PgStat_MsgAnlAncestors msg; + List *ancestors; + ListCell *lc; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANL_ANCESTORS); + msg.m_databaseid = MyDatabaseId; + msg.m_tableoid = relid; + msg.m_nancestors = 0; + + ancestors = get_partition_ancestors(relid); + foreach(lc, ancestors) + { + Oid ancestor = lfirst_oid(lc); + + msg.m_ancestors[msg.m_nancestors] = ancestor; + if (++msg.m_nancestors >= PGSTAT_NUM_ANCESTORENTRIES) + { + pgstat_send(&msg, offsetof(PgStat_MsgAnlAncestors, m_ancestors[0]) + + msg.m_nancestors * sizeof(Oid)); + msg.m_nancestors = 0; + } + } + + if (msg.m_nancestors > 0) + pgstat_send(&msg, offsetof(PgStat_MsgAnlAncestors, m_ancestors[0]) + + msg.m_nancestors * sizeof(Oid)); + + list_free(ancestors); } /* -------- @@ -1958,7 +2015,8 @@ pgstat_initstats(Relation rel) char relkind = rel->rd_rel->relkind; /* We only count stats for things that have storage */ - if (!RELKIND_HAS_STORAGE(relkind)) + if (!RELKIND_HAS_STORAGE(relkind) && + relkind != RELKIND_PARTITIONED_TABLE) { rel->pgstat_info = NULL; return; @@ -3287,6 +3345,10 @@ PgstatCollectorMain(int argc, char *argv[]) pgstat_recv_analyze(&msg.msg_analyze, len); break; + case PGSTAT_MTYPE_ANL_ANCESTORS: + pgstat_recv_anl_ancestors(&msg.msg_anl_ancestors, len); + break; + case PGSTAT_MTYPE_ARCHIVER: pgstat_recv_archiver(&msg.msg_archiver, len); break; @@ -3501,6 +3563,7 @@ pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) result->n_live_tuples = 0; result->n_dead_tuples = 0; result->changes_since_analyze = 0; + result->changes_since_analyze_reported = 0; result->inserts_since_vacuum = 0; result->blocks_fetched = 0; result->blocks_hit = 0; 
@@ -4768,6 +4831,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; + tabentry->changes_since_analyze_reported = 0; tabentry->inserts_since_vacuum = tabmsg->t_counts.t_tuples_inserted; tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched; tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit; @@ -5159,7 +5223,10 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) * have no good way to estimate how many of those there were. */ if (msg->m_resetcounter) + { tabentry->changes_since_analyze = 0; + tabentry->changes_since_analyze_reported = 0; + } if (msg->m_autovacuum) { @@ -5173,6 +5240,29 @@ pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) } } +static void +pgstat_recv_anl_ancestors(PgStat_MsgAnlAncestors *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + + for (int i = 0; i < msg->m_nancestors; i++) + { + Oid ancestor_relid = msg->m_ancestors[i]; + PgStat_StatTabEntry *ancestor; + + ancestor = pgstat_get_tab_entry(dbentry, ancestor_relid, true); + ancestor->changes_since_analyze += + tabentry->changes_since_analyze - tabentry->changes_since_analyze_reported; + } + + tabentry->changes_since_analyze_reported = tabentry->changes_since_analyze; + +} /* ---------- * pgstat_recv_archiver() - diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 7cd137506e..89cd324454 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -69,6 +69,7 @@ typedef enum StatMsgType PGSTAT_MTYPE_AUTOVAC_START, PGSTAT_MTYPE_VACUUM, PGSTAT_MTYPE_ANALYZE, + PGSTAT_MTYPE_ANL_ANCESTORS, PGSTAT_MTYPE_ARCHIVER, PGSTAT_MTYPE_BGWRITER, PGSTAT_MTYPE_WAL, @@ -106,7 +107,7 @@ typedef int64 PgStat_Counter; * * 
tuples_inserted/updated/deleted/hot_updated count attempted actions, * regardless of whether the transaction committed. delta_live_tuples, - * delta_dead_tuples, and changed_tuples are set depending on commit or abort. + * delta_dead_tuples, changed_tuples are set depending on commit or abort. * Note that delta_live_tuples and delta_dead_tuples can be negative! * ---------- */ @@ -429,6 +430,25 @@ typedef struct PgStat_MsgAnalyze PgStat_Counter m_dead_tuples; } PgStat_MsgAnalyze; +/* ---------- + * PgStat_MsgAnlAncestors Sent by the backend or autovacuum daemon + * to report the partitioned tables that are + * ancestors of a partition, so that analyze + * counters can be propagated to them + * ---------- + */ +#define PGSTAT_NUM_ANCESTORENTRIES \ + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(Oid) - sizeof(int)) \ + / sizeof(Oid)) + +typedef struct PgStat_MsgAnlAncestors +{ + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + Oid m_tableoid; + int m_nancestors; + Oid m_ancestors[PGSTAT_NUM_ANCESTORENTRIES]; +} PgStat_MsgAnlAncestors; /* ---------- * PgStat_MsgArchiver Sent by the archiver to update statistics. 
@@ -674,6 +694,7 @@ typedef union PgStat_Msg PgStat_MsgAutovacStart msg_autovacuum_start; PgStat_MsgVacuum msg_vacuum; PgStat_MsgAnalyze msg_analyze; + PgStat_MsgAnlAncestors msg_anl_ancestors; PgStat_MsgArchiver msg_archiver; PgStat_MsgBgWriter msg_bgwriter; PgStat_MsgWal msg_wal; @@ -769,6 +790,7 @@ typedef struct PgStat_StatTabEntry PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; PgStat_Counter changes_since_analyze; + PgStat_Counter changes_since_analyze_reported; PgStat_Counter inserts_since_vacuum; PgStat_Counter blocks_fetched; @@ -975,6 +997,7 @@ extern void pgstat_report_vacuum(Oid tableoid, bool shared, extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter); +extern void pgstat_report_anl_ancestors(Oid relid); extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 264deda7af..a8a1cc72d0 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1807,7 +1807,7 @@ pg_stat_all_tables| SELECT c.oid AS relid, FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) - WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char"])) + WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char", 'p'::"char"])) GROUP BY c.oid, n.nspname, c.relname; pg_stat_archiver| SELECT s.archived_count, s.last_archived_wal, @@ -2210,7 +2210,7 @@ pg_stat_xact_all_tables| SELECT c.oid AS relid, FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) - WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char"])) + WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char", 'm'::"char", 'p'::"char"])) GROUP BY c.oid, n.nspname, c.relname; pg_stat_xact_sys_tables| SELECT 
pg_stat_xact_all_tables.relid, pg_stat_xact_all_tables.schemaname,