mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-10-07 03:26:58 +02:00
05d4cbf9b6
RelFileNumbers are now assigned using a separate counter, instead of being assigned from the OID counter. This counter never wraps around: if all 2^56 possible RelFileNumbers are used, an internal error occurs. As the cluster is limited to 2^64 total bytes of WAL, this limitation should not cause a problem in practice. If the counter were 64 bits wide rather than 56 bits wide, we would need to increase the width of the BufferTag, which might adversely impact buffer lookup performance. Also, this lets us use bigint for pg_class.relfilenode and other places where these values are exposed at the SQL level without worrying about overflow. This should remove the need to keep "tombstone" files around until the next checkpoint when relations are removed. We do that to keep RelFileNumbers from being recycled, but now that won't happen anyway. However, this patch doesn't actually change anything in this area; it just makes it possible for a future patch to do so. Dilip Kumar, based on an idea from Andres Freund, who also reviewed some earlier versions of the patch. Further review and some wordsmithing by me. Also reviewed at various points by Ashutosh Sharma, Vignesh C, Amul Sul, Álvaro Herrera, and Tom Lane. Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
271 lines
8.0 KiB
C
271 lines
8.0 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* pg_buffercache_pages.c
|
|
* display some contents of the buffer cache
|
|
*
|
|
* contrib/pg_buffercache/pg_buffercache_pages.c
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/htup_details.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "funcapi.h"
|
|
#include "storage/buf_internals.h"
|
|
#include "storage/bufmgr.h"
|
|
|
|
|
|
/*
 * Number of columns in a result row.
 *
 * The row has NUM_BUFFERCACHE_PAGES_ELEM columns when the caller's function
 * definition includes the trailing pinning_backends column, and
 * NUM_BUFFERCACHE_PAGES_MIN_ELEM columns when it does not.  Any other column
 * count is rejected at run time.
 */
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
#define NUM_BUFFERCACHE_PAGES_ELEM		9

PG_MODULE_MAGIC;
|
|
|
|
/*
|
|
* Record structure holding the to be exposed cache data.
|
|
*/
|
|
typedef struct
|
|
{
|
|
uint32 bufferid;
|
|
RelFileNumber relfilenumber;
|
|
Oid reltablespace;
|
|
Oid reldatabase;
|
|
ForkNumber forknum;
|
|
BlockNumber blocknum;
|
|
bool isvalid;
|
|
bool isdirty;
|
|
uint16 usagecount;
|
|
|
|
/*
|
|
* An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
|
|
* being pinned by too many backends and each backend will only pin once
|
|
* because of bufmgr.c's PrivateRefCount infrastructure.
|
|
*/
|
|
int32 pinning_backends;
|
|
} BufferCachePagesRec;
|
|
|
|
|
|
/*
|
|
* Function context for data persisting over repeated calls.
|
|
*/
|
|
typedef struct
|
|
{
|
|
TupleDesc tupdesc;
|
|
BufferCachePagesRec *record;
|
|
} BufferCachePagesContext;
|
|
|
|
|
|
/*
|
|
* Function returning data from the shared buffer cache - buffer number,
|
|
* relation node/tablespace/database/blocknum and dirty indicator.
|
|
*/
|
|
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
|
|
PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4);
|
|
|
|
static Datum
|
|
pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid)
|
|
{
|
|
FuncCallContext *funcctx;
|
|
Datum result;
|
|
MemoryContext oldcontext;
|
|
BufferCachePagesContext *fctx; /* User function context. */
|
|
TupleDesc tupledesc;
|
|
TupleDesc expected_tupledesc;
|
|
HeapTuple tuple;
|
|
|
|
if (SRF_IS_FIRSTCALL())
|
|
{
|
|
int i;
|
|
|
|
funcctx = SRF_FIRSTCALL_INIT();
|
|
|
|
/* Switch context when allocating stuff to be used in later calls */
|
|
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
|
|
|
/* Create a user function context for cross-call persistence */
|
|
fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
|
|
|
|
/*
|
|
* To smoothly support upgrades from version 1.0 of this extension
|
|
* transparently handle the (non-)existence of the pinning_backends
|
|
* column. We unfortunately have to get the result type for that... -
|
|
* we can't use the result type determined by the function definition
|
|
* without potentially crashing when somebody uses the old (or even
|
|
* wrong) function definition though.
|
|
*/
|
|
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
|
|
elog(ERROR, "return type must be a row type");
|
|
|
|
if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
|
|
expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
|
|
elog(ERROR, "incorrect number of output arguments");
|
|
|
|
/* Construct a tuple descriptor for the result rows. */
|
|
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
|
|
INT4OID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
|
|
rfn_typid, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
|
|
OIDOID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
|
|
OIDOID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
|
|
INT2OID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
|
|
INT8OID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
|
|
BOOLOID, -1, 0);
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
|
|
INT2OID, -1, 0);
|
|
|
|
if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
|
|
TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
|
|
INT4OID, -1, 0);
|
|
|
|
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
|
|
|
/* Allocate NBuffers worth of BufferCachePagesRec records. */
|
|
fctx->record = (BufferCachePagesRec *)
|
|
MemoryContextAllocHuge(CurrentMemoryContext,
|
|
sizeof(BufferCachePagesRec) * NBuffers);
|
|
|
|
/* Set max calls and remember the user function context. */
|
|
funcctx->max_calls = NBuffers;
|
|
funcctx->user_fctx = fctx;
|
|
|
|
/* Return to original context when allocating transient memory */
|
|
MemoryContextSwitchTo(oldcontext);
|
|
|
|
/*
|
|
* Scan through all the buffers, saving the relevant fields in the
|
|
* fctx->record structure.
|
|
*
|
|
* We don't hold the partition locks, so we don't get a consistent
|
|
* snapshot across all buffers, but we do grab the buffer header
|
|
* locks, so the information of each buffer is self-consistent.
|
|
*/
|
|
for (i = 0; i < NBuffers; i++)
|
|
{
|
|
BufferDesc *bufHdr;
|
|
uint32 buf_state;
|
|
|
|
bufHdr = GetBufferDescriptor(i);
|
|
/* Lock each buffer header before inspecting. */
|
|
buf_state = LockBufHdr(bufHdr);
|
|
|
|
fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
|
|
fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
|
|
fctx->record[i].reltablespace = bufHdr->tag.spcOid;
|
|
fctx->record[i].reldatabase = bufHdr->tag.dbOid;
|
|
fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
|
|
fctx->record[i].blocknum = bufHdr->tag.blockNum;
|
|
fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
|
|
fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
|
|
|
|
if (buf_state & BM_DIRTY)
|
|
fctx->record[i].isdirty = true;
|
|
else
|
|
fctx->record[i].isdirty = false;
|
|
|
|
/* Note if the buffer is valid, and has storage created */
|
|
if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
|
|
fctx->record[i].isvalid = true;
|
|
else
|
|
fctx->record[i].isvalid = false;
|
|
|
|
UnlockBufHdr(bufHdr, buf_state);
|
|
}
|
|
}
|
|
|
|
funcctx = SRF_PERCALL_SETUP();
|
|
|
|
/* Get the saved state */
|
|
fctx = funcctx->user_fctx;
|
|
|
|
if (funcctx->call_cntr < funcctx->max_calls)
|
|
{
|
|
uint32 i = funcctx->call_cntr;
|
|
Datum values[NUM_BUFFERCACHE_PAGES_ELEM];
|
|
bool nulls[NUM_BUFFERCACHE_PAGES_ELEM];
|
|
|
|
values[0] = Int32GetDatum(fctx->record[i].bufferid);
|
|
nulls[0] = false;
|
|
|
|
/*
|
|
* Set all fields except the bufferid to null if the buffer is unused
|
|
* or not valid.
|
|
*/
|
|
if (fctx->record[i].blocknum == InvalidBlockNumber ||
|
|
fctx->record[i].isvalid == false)
|
|
{
|
|
nulls[1] = true;
|
|
nulls[2] = true;
|
|
nulls[3] = true;
|
|
nulls[4] = true;
|
|
nulls[5] = true;
|
|
nulls[6] = true;
|
|
nulls[7] = true;
|
|
/* unused for v1.0 callers, but the array is always long enough */
|
|
nulls[8] = true;
|
|
}
|
|
else
|
|
{
|
|
if (rfn_typid == INT8OID)
|
|
values[1] =
|
|
Int64GetDatum((int64) fctx->record[i].relfilenumber);
|
|
else
|
|
{
|
|
Assert(rfn_typid == OIDOID);
|
|
|
|
if (fctx->record[i].relfilenumber > OID_MAX)
|
|
ereport(ERROR,
|
|
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("relfilenode %llu is too large to be represented as an OID",
|
|
(unsigned long long) fctx->record[i].relfilenumber),
|
|
errhint("Upgrade the extension using ALTER EXTENSION pg_buffercache UPDATE"));
|
|
|
|
values[1] =
|
|
ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber);
|
|
}
|
|
|
|
nulls[1] = false;
|
|
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
|
|
nulls[2] = false;
|
|
values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
|
|
nulls[3] = false;
|
|
values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
|
|
nulls[4] = false;
|
|
values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
|
|
nulls[5] = false;
|
|
values[6] = BoolGetDatum(fctx->record[i].isdirty);
|
|
nulls[6] = false;
|
|
values[7] = Int16GetDatum(fctx->record[i].usagecount);
|
|
nulls[7] = false;
|
|
/* unused for v1.0 callers, but the array is always long enough */
|
|
values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
|
|
nulls[8] = false;
|
|
}
|
|
|
|
/* Build and return the tuple. */
|
|
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
|
|
result = HeapTupleGetDatum(tuple);
|
|
|
|
SRF_RETURN_NEXT(funcctx, result);
|
|
}
|
|
else
|
|
SRF_RETURN_DONE(funcctx);
|
|
}
|
|
|
|
/* entry point for old extension version */
|
|
Datum
|
|
pg_buffercache_pages(PG_FUNCTION_ARGS)
|
|
{
|
|
return pg_buffercache_pages_internal(fcinfo, OIDOID);
|
|
}
|
|
|
|
Datum
|
|
pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS)
|
|
{
|
|
return pg_buffercache_pages_internal(fcinfo, INT8OID);
|
|
}
|