postgresql/contrib/pgstattuple/pgstatindex.c

/*
* contrib/pgstattuple/pgstatindex.c
*
*
* pgstatindex
*
* Copyright (c) 2006 Satoshi Nagayasu <nagayasus@nttdata.co.jp>
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose, without fee, and without a
* written agreement is hereby granted, provided that the above
* copyright notice and this paragraph and the following two
* paragraphs appear in all copies.
*
* IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT,
* INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
* LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
* DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS
* IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE,
* SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*/
#include "postgres.h"
#include "access/gin_private.h"
#include "access/hash.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/relation.h"
#include "access/table.h"
#include "catalog/namespace.h"
#include "catalog/pg_am.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/varlena.h"
/*
* For backward compatibility, each function comes in two flavors: one
* taking a regclass-type input argument and one taking a text-type
* input argument.
*
* The functions taking a text-type input argument will be deprecated in
* a future release.
*/
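/*
* For illustration only (the index name here is hypothetical), the two
* SQL-level call forms look like this:
*
*   SELECT * FROM pgstatindex('t1_pkey');            -- text argument
*   SELECT * FROM pgstatindex('t1_pkey'::regclass);  -- regclass argument
*
* The text form maps to pgstatindex()/pgstatindex_v1_5() below, and the
* regclass form to pgstatindexbyid()/pgstatindexbyid_v1_5().
*/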
PG_FUNCTION_INFO_V1(pgstatindex);
PG_FUNCTION_INFO_V1(pgstatindexbyid);
PG_FUNCTION_INFO_V1(pg_relpages);
PG_FUNCTION_INFO_V1(pg_relpagesbyid);
PG_FUNCTION_INFO_V1(pgstatginindex);
PG_FUNCTION_INFO_V1(pgstathashindex);
PG_FUNCTION_INFO_V1(pgstatindex_v1_5);
PG_FUNCTION_INFO_V1(pgstatindexbyid_v1_5);
PG_FUNCTION_INFO_V1(pg_relpages_v1_5);
PG_FUNCTION_INFO_V1(pg_relpagesbyid_v1_5);
PG_FUNCTION_INFO_V1(pgstatginindex_v1_5);
Datum pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo);
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)
#define IS_GIN(r) ((r)->rd_rel->relam == GIN_AM_OID)
#define IS_HASH(r) ((r)->rd_rel->relam == HASH_AM_OID)
/* ------------------------------------------------
* A structure holding whole-index statistics for a btree index,
* used by pgstatindex().
* ------------------------------------------------
*/
typedef struct BTIndexStat
{
uint32 version;
uint32 level;
BlockNumber root_blkno;
uint64 internal_pages;
uint64 leaf_pages;
uint64 empty_pages;
uint64 deleted_pages;
uint64 max_avail;
uint64 free_space;
uint64 fragments;
} BTIndexStat;
/* ------------------------------------------------
* A structure holding whole-index statistics for a GIN index,
* used by pgstatginindex().
* ------------------------------------------------
*/
typedef struct GinIndexStat
{
int32 version;
BlockNumber pending_pages;
int64 pending_tuples;
} GinIndexStat;
/* ------------------------------------------------
* A structure holding whole-index statistics for a HASH index,
* used by pgstathashindex().
* ------------------------------------------------
*/
typedef struct HashIndexStat
{
int32 version;
int32 space_per_page;
BlockNumber bucket_pages;
BlockNumber overflow_pages;
BlockNumber bitmap_pages;
BlockNumber unused_pages;
int64 live_items;
int64 dead_items;
uint64 free_space;
} HashIndexStat;
static Datum pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo);
static int64 pg_relpages_impl(Relation rel);
static void GetHashPageStats(Page page, HashIndexStat *stats);
/* ------------------------------------------------------
* pgstatindex()
*
* Usage: SELECT * FROM pgstatindex('t1_pkey');
*
* The superuser() check here must be kept as the library might be upgraded
* without the extension being upgraded, meaning that in pre-1.5 installations
* these functions could be called by any user.
* ------------------------------------------------------
*/
Datum
pgstatindex(PG_FUNCTION_ARGS)
{
text *relname = PG_GETARG_TEXT_PP(0);
Relation rel;
RangeVar *relrv;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
PG_RETURN_DATUM(pgstatindex_impl(rel, fcinfo));
}
/*
* As of pgstattuple version 1.5, we no longer need to check if the user
* is a superuser because we REVOKE EXECUTE on the function from PUBLIC.
* Users can then grant access to it based on their policies.
*
* Otherwise identical to pgstatindex (above).
*/
Datum
pgstatindex_v1_5(PG_FUNCTION_ARGS)
{
text *relname = PG_GETARG_TEXT_PP(0);
Relation rel;
RangeVar *relrv;
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
PG_RETURN_DATUM(pgstatindex_impl(rel, fcinfo));
}
/*
* The superuser() check here must be kept as the library might be upgraded
* without the extension being upgraded, meaning that in pre-1.5 installations
* these functions could be called by any user.
*/
Datum
pgstatindexbyid(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
Relation rel;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
rel = relation_open(relid, AccessShareLock);
PG_RETURN_DATUM(pgstatindex_impl(rel, fcinfo));
}
/* No need for superuser checks in v1.5, see above */
Datum
pgstatindexbyid_v1_5(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
Relation rel;
rel = relation_open(relid, AccessShareLock);
PG_RETURN_DATUM(pgstatindex_impl(rel, fcinfo));
}
static Datum
pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
{
Datum result;
BlockNumber nblocks;
BlockNumber blkno;
BTIndexStat indexStat;
BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
if (!IS_INDEX(rel) || !IS_BTREE(rel))
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("relation \"%s\" is not a btree index",
RelationGetRelationName(rel))));
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/*
* A !indisready index could lead to ERRCODE_DATA_CORRUPTED later, so exit
* early. We're capable of assessing an indisready&&!indisvalid index,
* but the results could be confusing. For example, the index's size
* could be too low for a valid index of the table.
*/
if (!rel->rd_index->indisvalid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("index \"%s\" is not valid",
RelationGetRelationName(rel))));
/*
* Read metapage
*/
{
Buffer buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy);
Page page = BufferGetPage(buffer);
BTMetaPageData *metad = BTPageGetMeta(page);
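/*
* BTPageGetMeta() returns a pointer into the buffer page itself, so copy
* the fields we need into indexStat before releasing the buffer.
*/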
indexStat.version = metad->btm_version;
indexStat.level = metad->btm_level;
indexStat.root_blkno = metad->btm_root;
ReleaseBuffer(buffer);
}
/* -- init counters -- */
indexStat.internal_pages = 0;
indexStat.leaf_pages = 0;
indexStat.empty_pages = 0;
indexStat.deleted_pages = 0;
indexStat.max_avail = 0;
indexStat.free_space = 0;
indexStat.fragments = 0;
/*
* Scan all blocks except the metapage
*/
nblocks = RelationGetNumberOfBlocks(rel);
for (blkno = 1; blkno < nblocks; blkno++)
{
Buffer buffer;
Page page;
BTPageOpaque opaque;
CHECK_FOR_INTERRUPTS();
/* Read and lock buffer */
buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
opaque = BTPageGetOpaque(page);
/*
* Determine page type, and update totals.
*
* Note that we arbitrarily bucket deleted pages together without
* considering if they're leaf pages or internal pages.
*/
if (P_ISDELETED(opaque))
indexStat.deleted_pages++;
else if (P_IGNORE(opaque))
indexStat.empty_pages++; /* this is the "half dead" state */
else if (P_ISLEAF(opaque))
{
int max_avail;
max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
indexStat.max_avail += max_avail;
indexStat.free_space += PageGetFreeSpace(page);
indexStat.leaf_pages++;
/*
* If the right sibling (next leaf) is at a lower block number than the
* current page, the leaf pages are out of physical order; count that as
* fragmentation.
*/
if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
indexStat.fragments++;
}
else
indexStat.internal_pages++;
/* Unlock and release buffer */
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
}
relation_close(rel, AccessShareLock);
/*----------------------------
* Build a result tuple
*----------------------------
*/
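/*
* The ten values below are, in order: btree version, tree level, index
* size in bytes ((1 + all counted pages) * BLCKSZ, the extra 1 being the
* metapage), root block number, internal, leaf, empty, and deleted page
* counts, average leaf density (100 - 100 * free_space / max_avail), and
* leaf fragmentation (100 * fragments / leaf_pages).
*/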
{
TupleDesc tupleDesc;
int j;
char *values[10];
HeapTuple tuple;
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
j = 0;
values[j++] = psprintf("%d", indexStat.version);
values[j++] = psprintf("%d", indexStat.level);
values[j++] = psprintf(INT64_FORMAT,
(1 + /* include the metapage in index_size */
indexStat.leaf_pages +
indexStat.internal_pages +
indexStat.deleted_pages +
indexStat.empty_pages) * BLCKSZ);
values[j++] = psprintf("%u", indexStat.root_blkno);
values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages);
values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages);
values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages);
values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages);
if (indexStat.max_avail > 0)
values[j++] = psprintf("%.2f",
100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
else
values[j++] = pstrdup("NaN");
if (indexStat.leaf_pages > 0)
values[j++] = psprintf("%.2f",
(double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
else
values[j++] = pstrdup("NaN");
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
values);
result = HeapTupleGetDatum(tuple);
}
return result;
}
/* --------------------------------------------------------
* pg_relpages()
*
* Get the number of pages of the table/index.
*
* Usage: SELECT pg_relpages('t1');
* SELECT pg_relpages('t1_pkey');
*
* Must keep superuser() check, see above.
* --------------------------------------------------------
*/
Datum
pg_relpages(PG_FUNCTION_ARGS)
{
text *relname = PG_GETARG_TEXT_PP(0);
Relation rel;
RangeVar *relrv;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
PG_RETURN_INT64(pg_relpages_impl(rel));
}
/* No need for superuser checks in v1.5, see above */
Datum
pg_relpages_v1_5(PG_FUNCTION_ARGS)
{
text *relname = PG_GETARG_TEXT_PP(0);
Relation rel;
RangeVar *relrv;
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
PG_RETURN_INT64(pg_relpages_impl(rel));
}
/* Must keep superuser() check, see above. */
Datum
pg_relpagesbyid(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
Relation rel;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
rel = relation_open(relid, AccessShareLock);
PG_RETURN_INT64(pg_relpages_impl(rel));
}
/* No need for superuser checks in v1.5, see above */
Datum
pg_relpagesbyid_v1_5(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
Relation rel;
rel = relation_open(relid, AccessShareLock);
PG_RETURN_INT64(pg_relpages_impl(rel));
}
static int64
pg_relpages_impl(Relation rel)
{
int64 relpages;
if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind))
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get page count of relation \"%s\"",
RelationGetRelationName(rel)),
errdetail_relkind_not_supported(rel->rd_rel->relkind)));
/* note: this will work OK on non-local temp tables */
relpages = RelationGetNumberOfBlocks(rel);
relation_close(rel, AccessShareLock);
return relpages;
}
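/*
* A minimal cross-check from SQL (the relation name is hypothetical):
* since RelationGetNumberOfBlocks() reports the current physical length
* of the main fork, the following should normally hold:
*
*   SELECT pg_relpages('t1') = pg_relation_size('t1') / current_setting('block_size')::int;
*/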
/* ------------------------------------------------------
* pgstatginindex()
*
* Usage: SELECT * FROM pgstatginindex('ginindex');
*
* Must keep superuser() check, see above.
* ------------------------------------------------------
*/
Datum
pgstatginindex(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pgstattuple functions")));
PG_RETURN_DATUM(pgstatginindex_internal(relid, fcinfo));
}
/* No need for superuser checks in v1.5, see above */
Datum
pgstatginindex_v1_5(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
PG_RETURN_DATUM(pgstatginindex_internal(relid, fcinfo));
}
Datum
pgstatginindex_internal(Oid relid, FunctionCallInfo fcinfo)
{
Relation rel;
Buffer buffer;
Page page;
GinMetaPageData *metadata;
GinIndexStat stats;
HeapTuple tuple;
TupleDesc tupleDesc;
Datum values[3];
bool nulls[3] = {false, false, false};
Datum result;
rel = relation_open(relid, AccessShareLock);
if (!IS_INDEX(rel) || !IS_GIN(rel))
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("relation \"%s\" is not a GIN index",
RelationGetRelationName(rel))));
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary indexes of other sessions")));
/* see pgstatindex_impl */
if (!rel->rd_index->indisvalid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("index \"%s\" is not valid",
RelationGetRelationName(rel))));
/*
* Read metapage
*/
buffer = ReadBuffer(rel, GIN_METAPAGE_BLKNO);
LockBuffer(buffer, GIN_SHARE);
page = BufferGetPage(buffer);
metadata = GinPageGetMeta(page);
stats.version = metadata->ginVersion;
stats.pending_pages = metadata->nPendingPages;
stats.pending_tuples = metadata->nPendingHeapTuples;
UnlockReleaseBuffer(buffer);
relation_close(rel, AccessShareLock);
/*
* Build a tuple descriptor for our result type
*/
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
values[0] = Int32GetDatum(stats.version);
values[1] = UInt32GetDatum(stats.pending_pages);
values[2] = Int64GetDatum(stats.pending_tuples);
/*
* Build and return the tuple
*/
tuple = heap_form_tuple(tupleDesc, values, nulls);
result = HeapTupleGetDatum(tuple);
return result;
}
/* ------------------------------------------------------
* pgstathashindex()
*
* Usage: SELECT * FROM pgstathashindex('hashindex');
* ------------------------------------------------------
*/
Datum
pgstathashindex(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
BlockNumber nblocks;
BlockNumber blkno;
Relation rel;
HashIndexStat stats;
BufferAccessStrategy bstrategy;
HeapTuple tuple;
TupleDesc tupleDesc;
Datum values[8];
bool nulls[8] = {0};
Buffer metabuf;
HashMetaPage metap;
float8 free_percent;
uint64 total_space;
rel = relation_open(relid, AccessShareLock);
if (!IS_INDEX(rel) || !IS_HASH(rel))
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("relation \"%s\" is not a hash index",
RelationGetRelationName(rel))));
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary indexes of other sessions")));
/* see pgstatindex_impl */
if (!rel->rd_index->indisvalid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("index \"%s\" is not valid",
RelationGetRelationName(rel))));
/* Get the information we need from the metapage. */
memset(&stats, 0, sizeof(stats));
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
stats.version = metap->hashm_version;
stats.space_per_page = metap->hashm_bsize;
_hash_relbuf(rel, metabuf);
/* Get the current relation length */
nblocks = RelationGetNumberOfBlocks(rel);
/* prepare access strategy for this index */
bstrategy = GetAccessStrategy(BAS_BULKREAD);
/* Start from blkno 1 as 0th block is metapage */
for (blkno = 1; blkno < nblocks; blkno++)
{
Buffer buf;
Page page;
CHECK_FOR_INTERRUPTS();
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
bstrategy);
LockBuffer(buf, BUFFER_LOCK_SHARE);
page = (Page) BufferGetPage(buf);
if (PageIsNew(page))
stats.unused_pages++;
else if (PageGetSpecialSize(page) !=
MAXALIGN(sizeof(HashPageOpaqueData)))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index \"%s\" contains corrupted page at block %u",
RelationGetRelationName(rel),
BufferGetBlockNumber(buf))));
else
{
HashPageOpaque opaque;
int pagetype;
opaque = HashPageGetOpaque(page);
pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
if (pagetype == LH_BUCKET_PAGE)
{
stats.bucket_pages++;
GetHashPageStats(page, &stats);
}
else if (pagetype == LH_OVERFLOW_PAGE)
{
stats.overflow_pages++;
GetHashPageStats(page, &stats);
}
else if (pagetype == LH_BITMAP_PAGE)
stats.bitmap_pages++;
else if (pagetype == LH_UNUSED_PAGE)
stats.unused_pages++;
else
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("unexpected page type 0x%04X in HASH index \"%s\" block %u",
opaque->hasho_flag, RelationGetRelationName(rel),
BufferGetBlockNumber(buf))));
}
UnlockReleaseBuffer(buf);
}
/* Done accessing the index */
index_close(rel, AccessShareLock);
/* Count unused pages as free space. */
stats.free_space += (uint64) stats.unused_pages * stats.space_per_page;
/*
* Total space available for tuples excludes the metapage and the bitmap
* pages.
*/
total_space = (uint64) (nblocks - (stats.bitmap_pages + 1)) *
stats.space_per_page;
if (total_space == 0)
free_percent = 0.0;
else
free_percent = 100.0 * stats.free_space / total_space;
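/*
* Illustrative arithmetic (hypothetical numbers): with 10 usable pages of
* 8152 bytes each, total_space is 81520; if free_space sums to 20380, the
* reported free_percent is 100.0 * 20380 / 81520 = 25.00.
*/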
/*
* Build a tuple descriptor for our result type
*/
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
tupleDesc = BlessTupleDesc(tupleDesc);
/*
* Build and return the tuple
*/
values[0] = Int32GetDatum(stats.version);
values[1] = Int64GetDatum((int64) stats.bucket_pages);
values[2] = Int64GetDatum((int64) stats.overflow_pages);
values[3] = Int64GetDatum((int64) stats.bitmap_pages);
values[4] = Int64GetDatum((int64) stats.unused_pages);
values[5] = Int64GetDatum(stats.live_items);
values[6] = Int64GetDatum(stats.dead_items);
values[7] = Float8GetDatum(free_percent);
tuple = heap_form_tuple(tupleDesc, values, nulls);
PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}
/* -------------------------------------------------
* GetHashPageStats()
*
* Collect statistics for a single hash page
* -------------------------------------------------
*/
static void
GetHashPageStats(Page page, HashIndexStat *stats)
{
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
int off;
/* count live and dead tuples, and free space */
for (off = FirstOffsetNumber; off <= maxoff; off++)
{
ItemId id = PageGetItemId(page, off);
if (!ItemIdIsDead(id))
stats->live_items++;
else
stats->dead_items++;
}
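/*
* PageGetExactFreeSpace() reports the page's raw free space, without the
* allowance for a new line pointer that PageGetFreeSpace() subtracts.
*/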
stats->free_space += PageGetExactFreeSpace(page);
}