/*-------------------------------------------------------------------------
 *
 * pg_buffercache_pages.c
 *    display some contents of the buffer cache
 *
 *    contrib/pg_buffercache/pg_buffercache_pages.c
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"


#define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
#define NUM_BUFFERCACHE_PAGES_ELEM 9
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4

PG_MODULE_MAGIC;

/*
 * Record structure holding the to-be-exposed cache data.
 */
typedef struct
{
    uint32      bufferid;
    RelFileNumber relfilenumber;
    Oid         reltablespace;
    Oid         reldatabase;
    ForkNumber  forknum;
    BlockNumber blocknum;
    bool        isvalid;
    bool        isdirty;
    uint16      usagecount;

    /*
     * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
     * being pinned by too many backends and each backend will only pin once
     * because of bufmgr.c's PrivateRefCount infrastructure.
     */
    int32       pinning_backends;
} BufferCachePagesRec;


/*
 * Function context for data persisting over repeated calls.
 */
typedef struct
{
    TupleDesc   tupdesc;
    BufferCachePagesRec *record;
} BufferCachePagesContext;


/*
 * Function returning data from the shared buffer cache - buffer number,
 * relation node/tablespace/database/blocknum and dirty indicator.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);

Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum       result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;  /* User function context. */
    TupleDesc   tupledesc;
    TupleDesc   expected_tupledesc;
    HeapTuple   tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int         i;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /*
         * To smoothly support upgrades from version 1.0 of this extension,
         * transparently handle the (non-)existence of the pinning_backends
         * column.  Unfortunately that requires looking up the actual call
         * result type: we cannot simply trust the result type implied by
         * this function's definition, since somebody might be running with
         * an old (or even wrong) definition, which could otherwise crash us.
         */
        if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
            expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
            elog(ERROR, "incorrect number of output arguments");
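        /*
         * For illustration, the SQL-level declaration this range check
         * guards looks roughly like the following (a sketch; the
         * authoritative definitions live in the extension's SQL scripts,
         * e.g. pg_buffercache--1.2.sql):
         *
         *   CREATE FUNCTION pg_buffercache_pages()
         *   RETURNS SETOF RECORD
         *   AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
         *   LANGUAGE C PARALLEL SAFE;
         *
         * The pg_buffercache view built over this function listed eight
         * output columns in extension version 1.0 and nine (adding
         * pinning_backends int4) in later versions, which is why natts may
         * legitimately be anywhere in the range checked above.
         */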
        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
            TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
                               INT4OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *)
            MemoryContextAllocHuge(CurrentMemoryContext,
                                   sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         *
         * We don't hold the partition locks, so we don't get a consistent
         * snapshot across all buffers, but we do grab the buffer header
         * locks, so the information of each buffer is self-consistent.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *bufHdr;
            uint32      buf_state;

            bufHdr = GetBufferDescriptor(i);
            /* Lock each buffer header before inspecting. */
            buf_state = LockBufHdr(bufHdr);

            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
            fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
            fctx->record[i].reltablespace = bufHdr->tag.spcOid;
            fctx->record[i].reldatabase = bufHdr->tag.dbOid;
            fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
            fctx->record[i].blocknum = bufHdr->tag.blockNum;
            fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
            fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

            if (buf_state & BM_DIRTY)
                fctx->record[i].isdirty = true;
            else
                fctx->record[i].isdirty = false;

            /* Note if the buffer is valid, and has storage created */
            if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
                fctx->record[i].isvalid = true;
            else
                fctx->record[i].isvalid = false;

            UnlockBufHdr(bufHdr, buf_state);
        }
    }
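    /*
     * From here on, each per-call invocation emits one row of the snapshot
     * captured above.  For illustration only (sample values, not the output
     * of any particular server), a row of the nine-column variant might
     * read:
     *
     *   bufferid=1, relfilenode=1249, reltablespace=1663, reldatabase=5,
     *   relforknumber=0, relblocknumber=0, isdirty=f, usage_count=5,
     *   pinning_backends=0
     */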
    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;

            /* unused for v1.0 callers, but the array is always long enough */
            nulls[8] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;

            /* unused for v1.0 callers, but the array is always long enough */
            values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
            nulls[8] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}
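/*
 * Example usage from SQL, via the pg_buffercache view built on top of
 * pg_buffercache_pages().  This is illustrative only (a query along these
 * lines appears in the extension's documentation); the output depends
 * entirely on the current contents of the buffer pool:
 *
 *   SELECT n.nspname, c.relname, count(*) AS buffers
 *   FROM pg_buffercache b
 *        JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
 *        JOIN pg_namespace n ON n.oid = c.relnamespace
 *   GROUP BY n.nspname, c.relname
 *   ORDER BY 3 DESC
 *   LIMIT 10;
 */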
/*
 * Return a one-row summary of shared buffer state: used/unused/dirty/pinned
 * counts and the average usage count of valid buffers.
 */
Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{
    Datum       result;
    TupleDesc   tupledesc;
    HeapTuple   tuple;
    Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
    bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];

    int32       buffers_used = 0;
    int32       buffers_unused = 0;
    int32       buffers_dirty = 0;
    int32       buffers_pinned = 0;
    int64       usagecount_total = 0;

    if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");

    for (int i = 0; i < NBuffers; i++)
    {
        BufferDesc *bufHdr;
        uint32      buf_state;

        /*
         * This function summarizes the state of all headers. Locking the
         * buffer headers wouldn't provide an improved result as the state of
         * the buffer can still change after we release the lock and it'd
         * noticeably increase the cost of the function.
         */
        bufHdr = GetBufferDescriptor(i);
        buf_state = pg_atomic_read_u32(&bufHdr->state);

        if (buf_state & BM_VALID)
        {
            buffers_used++;
            usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);

            if (buf_state & BM_DIRTY)
                buffers_dirty++;
        }
        else
            buffers_unused++;

        if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
            buffers_pinned++;
    }

    memset(nulls, 0, sizeof(nulls));
    values[0] = Int32GetDatum(buffers_used);
    values[1] = Int32GetDatum(buffers_unused);
    values[2] = Int32GetDatum(buffers_dirty);
    values[3] = Int32GetDatum(buffers_pinned);

    if (buffers_used != 0)
        values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
    else
        nulls[4] = true;

    /* Build and return the tuple. */
    tuple = heap_form_tuple(tupledesc, values, nulls);
    result = HeapTupleGetDatum(tuple);

    PG_RETURN_DATUM(result);
}

/*
 * Return one row per possible usage count, giving the number of buffers,
 * dirty buffers, and pinned buffers at that usage count.
 */
Datum
pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
{
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
    int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
    int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
    Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
    bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};

    InitMaterializedSRF(fcinfo, 0);

    for (int i = 0; i < NBuffers; i++)
    {
        BufferDesc *bufHdr = GetBufferDescriptor(i);
        uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
        int         usage_count;

        usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
        usage_counts[usage_count]++;

        if (buf_state & BM_DIRTY)
            dirty[usage_count]++;

        if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
            pinned[usage_count]++;
    }

    for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
    {
        values[0] = Int32GetDatum(i);
        values[1] = Int32GetDatum(usage_counts[i]);
        values[2] = Int32GetDatum(dirty[i]);
        values[3] = Int32GetDatum(pinned[i]);

        tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
                             values, nulls);
    }

    return (Datum) 0;
}

/*
 * Try to evict a shared buffer.
 */
Datum
pg_buffercache_evict(PG_FUNCTION_ARGS)
{
    Buffer      buf = PG_GETARG_INT32(0);

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 errmsg("must be superuser to use pg_buffercache_evict function")));

    if (buf < 1 || buf > NBuffers)
        elog(ERROR, "bad buffer ID: %d", buf);

    PG_RETURN_BOOL(EvictUnpinnedBuffer(buf));
}
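/*
 * Example usage of the remaining entry points (illustrative; actual results
 * depend on the state of the buffer pool, and pg_buffercache_evict() is
 * restricted to superusers):
 *
 *   SELECT * FROM pg_buffercache_summary();
 *   SELECT * FROM pg_buffercache_usage_counts();
 *   SELECT pg_buffercache_evict(1);  -- returns false if the buffer could
 *                                    -- not be evicted, e.g. because pinned
 */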