postgresql/contrib/pgstattuple/pgstatapprox.c

/*-------------------------------------------------------------------------
*
* pgstatapprox.c
* Bloat estimation functions
*
* Copyright (c) 2014-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/pgstattuple/pgstatapprox.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/relation.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "catalog/namespace.h"
#include "catalog/pg_am_d.h"
#include "commands/vacuum.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/procarray.h"
#include "utils/builtins.h"
PG_FUNCTION_INFO_V1(pgstattuple_approx);
PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5);
Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo);
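
/*
 * Per-relation statistics gathered by statapprox_heap().  The fields are
 * reported, in this order, as the output columns of pgstattuple_approx();
 * NUM_OUTPUT_COLUMNS below must match the number of fields here.
 */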
typedef struct output_type
{
uint64 table_len;
double scanned_percent;
uint64 tuple_count;
uint64 tuple_len;
double tuple_percent;
uint64 dead_tuple_count;
uint64 dead_tuple_len;
double dead_tuple_percent;
uint64 free_space;
double free_percent;
} output_type;
#define NUM_OUTPUT_COLUMNS 10
/*
* This function takes an already open relation and scans its pages,
* skipping those that have the corresponding visibility map bit set.
* For pages we skip, we find the free space from the free space map
* and approximate tuple_len on that basis. For the others, we count
* the exact number of dead tuples etc.
*
* This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but
* we do not try to avoid skipping single pages.
*/
static void
statapprox_heap(Relation rel, output_type *stat)
{
BlockNumber scanned,
nblocks,
blkno;
Buffer vmbuffer = InvalidBuffer;
BufferAccessStrategy bstrategy;
TransactionId OldestXmin;
OldestXmin = GetOldestNonRemovableTransactionId(rel);
bstrategy = GetAccessStrategy(BAS_BULKREAD);
nblocks = RelationGetNumberOfBlocks(rel);
scanned = 0;
for (blkno = 0; blkno < nblocks; blkno++)
{
Buffer buf;
Page page;
OffsetNumber offnum,
maxoff;
Size freespace;
CHECK_FOR_INTERRUPTS();
/*
* If the page has only visible tuples, then we can find out the free
* space from the FSM and move on.
*/
if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
{
freespace = GetRecordedFreeSpace(rel, blkno);
stat->tuple_len += BLCKSZ - freespace;
stat->free_space += freespace;
continue;
}
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
RBM_NORMAL, bstrategy);
LockBuffer(buf, BUFFER_LOCK_SHARE);
page = BufferGetPage(buf);
/*
* It's not safe to call PageGetHeapFreeSpace() on new pages, so we
* treat them as being free space for our purposes.
*/
if (!PageIsNew(page))
stat->free_space += PageGetHeapFreeSpace(page);
else
stat->free_space += BLCKSZ - SizeOfPageHeaderData;
/* We may count the page as scanned even if it's new/empty */
scanned++;
if (PageIsNew(page) || PageIsEmpty(page))
{
UnlockReleaseBuffer(buf);
continue;
}
/*
* Look at each tuple on the page and decide whether it's live or
* dead, then count it and its size. Unlike lazy_scan_heap, we can
* afford to ignore problems and special cases.
*/
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid;
HeapTupleData tuple;
itemid = PageGetItemId(page, offnum);
if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) ||
ItemIdIsDead(itemid))
{
continue;
}
Assert(ItemIdIsNormal(itemid));
ItemPointerSet(&(tuple.t_self), blkno, offnum);
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple.t_len = ItemIdGetLength(itemid);
tuple.t_tableOid = RelationGetRelid(rel);
/*
* We follow VACUUM's lead in counting INSERT_IN_PROGRESS tuples
* as "dead" while DELETE_IN_PROGRESS tuples are "live". We don't
* bother distinguishing tuples inserted/deleted by our own
* transaction.
*/
switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
{
case HEAPTUPLE_LIVE:
case HEAPTUPLE_DELETE_IN_PROGRESS:
stat->tuple_len += tuple.t_len;
stat->tuple_count++;
break;
case HEAPTUPLE_DEAD:
case HEAPTUPLE_RECENTLY_DEAD:
case HEAPTUPLE_INSERT_IN_PROGRESS:
stat->dead_tuple_len += tuple.t_len;
stat->dead_tuple_count++;
break;
default:
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
break;
}
}
UnlockReleaseBuffer(buf);
}
stat->table_len = (uint64) nblocks * BLCKSZ;
/*
* We don't know how many tuples are in the pages we didn't scan, so
* extrapolate the live-tuple count to the whole table in the same way
* that VACUUM does. (Like VACUUM, we're not taking a random sample, so
* just extrapolating linearly seems unsafe.) There should be no dead
* tuples in all-visible pages, so no correction is needed for that, and
* we already accounted for the space in those pages, too.
*/
stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned,
stat->tuple_count);
/* It's not clear if we could get -1 here, but be safe. */
stat->tuple_count = Max(stat->tuple_count, 0);
/*
* Calculate percentages if the relation has one or more pages.
*/
if (nblocks != 0)
{
stat->scanned_percent = 100.0 * scanned / nblocks;
stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
stat->free_percent = 100.0 * stat->free_space / stat->table_len;
}
if (BufferIsValid(vmbuffer))
{
ReleaseBuffer(vmbuffer);
vmbuffer = InvalidBuffer;
}
}
/*
* Returns estimated live/dead tuple statistics for the given relid.
*
* The superuser() check here must be kept as the library might be upgraded
* without the extension being upgraded, meaning that in pre-1.5 installations
* these functions could be called by any user.
*/
Datum
pgstattuple_approx(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo));
}
/*
* As of pgstattuple version 1.5, we no longer need to check if the user
* is a superuser because we REVOKE EXECUTE on the SQL function from PUBLIC.
* Users can then grant access to it based on their policies.
*
* Otherwise identical to pgstattuple_approx (above).
*/
Datum
pgstattuple_approx_v1_5(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo));
}
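
/*
 * Example SQL usage of the two entry points above.  This is only an
 * illustrative sketch: it assumes the pgstattuple extension is installed
 * (and, before extension version 1.5, a superuser caller), and the table
 * name is arbitrary:
 *
 *		CREATE EXTENSION pgstattuple;
 *		SELECT * FROM pgstattuple_approx('pg_catalog.pg_class');
 *
 * The call returns a single row whose columns correspond, in order, to the
 * fields of output_type above.
 */
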
Datum
pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo)
{
Relation rel;
output_type stat = {0};
TupleDesc tupdesc;
bool nulls[NUM_OUTPUT_COLUMNS];
Datum values[NUM_OUTPUT_COLUMNS];
HeapTuple ret;
int i = 0;
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
if (tupdesc->natts != NUM_OUTPUT_COLUMNS)
elog(ERROR, "incorrect number of output arguments");
rel = relation_open(relid, AccessShareLock);
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/*
* We support only relation kinds with a visibility map and a free space
* map.
*/
if (!(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW ||
rel->rd_rel->relkind == RELKIND_TOASTVALUE))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("relation \"%s\" is of wrong relation kind",
RelationGetRelationName(rel)),
errdetail_relkind_not_supported(rel->rd_rel->relkind)));
if (rel->rd_rel->relam != HEAP_TABLE_AM_OID)
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("only heap AM is supported")));
statapprox_heap(rel, &stat);
relation_close(rel, AccessShareLock);
memset(nulls, 0, sizeof(nulls));
values[i++] = Int64GetDatum(stat.table_len);
values[i++] = Float8GetDatum(stat.scanned_percent);
values[i++] = Int64GetDatum(stat.tuple_count);
values[i++] = Int64GetDatum(stat.tuple_len);
values[i++] = Float8GetDatum(stat.tuple_percent);
values[i++] = Int64GetDatum(stat.dead_tuple_count);
values[i++] = Int64GetDatum(stat.dead_tuple_len);
values[i++] = Float8GetDatum(stat.dead_tuple_percent);
values[i++] = Int64GetDatum(stat.free_space);
values[i++] = Float8GetDatum(stat.free_percent);
ret = heap_form_tuple(tupdesc, values, nulls);
return HeapTupleGetDatum(ret);
}