pgstat: Track more detailed relation IO statistics

Commit 28e626bde0 introduced the infrastructure for tracking more detailed IO
statistics. This commit adds the actual collection of the new IO statistics
for relations and temporary relations. See the aforementioned commit for goals
high-level design.

The changes in this commit are fairly straightforward. The bulk of the change
is passing sufficient information to the callsites of pgstat_count_io_op().

A somewhat unsightly detail is that fsyncs are counted in md.c, whereas the
other pgstat_count_io_op() calls are in bufmgr.c/localbuf.c. But as the number
of fsyncs is tied to md.c implementation details, it's not obvious there is a
better place for them.
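
For orientation, every counting call added below has the same shape: an
IOObject (what is being operated on), an IOContext (which buffer access
strategy context the IO happens in), and an IOOp (the kind of operation).
A minimal sketch of two callsites, using only identifiers that appear in
this commit (and assuming the backend build environment):

    /* a read of a shared-relation block, outside any strategy ring */
    pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_READ);

    /* the corresponding read of a temporary-relation block */
    pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_READ);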

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/20200124195226.lth52iydq2n2uilq@alap3.anarazel.de
Committed by Andres Freund, 2023-02-09 22:22:26 -08:00
parent 40d0b2d415
commit f30d62c2fc
6 changed files with 184 additions and 36 deletions

src/backend/storage/buffer/bufmgr.c

@@ -472,8 +472,9 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
bool *foundPtr, IOContext *io_context);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
IOObject io_object, IOContext io_context);
static void FindAndDropRelationBuffers(RelFileLocator rlocator,
ForkNumber forkNum,
BlockNumber nForkBlock,
@@ -814,6 +815,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BufferDesc *bufHdr;
Block bufBlock;
bool found;
IOContext io_context;
IOObject io_object;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);
@@ -846,7 +849,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
/*
* LocalBufferAlloc() will set the io_context to IOCONTEXT_NORMAL. We
* do not use a BufferAccessStrategy for I/O of temporary tables.
* However, in some cases, the "strategy" may not be NULL, so we can't
* rely on IOContextForStrategy() to set the right IOContext for us.
* This may happen in cases like CREATE TEMPORARY TABLE AS...
*/
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, &io_context);
if (found)
pgBufferUsage.local_blks_hit++;
else if (isExtend)
@@ -862,7 +872,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* not currently in memory.
*/
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
strategy, &found, &io_context);
if (found)
pgBufferUsage.shared_blks_hit++;
else if (isExtend)
@@ -977,7 +987,16 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isLocalBuf)
{
bufBlock = LocalBufHdrGetBlock(bufHdr);
io_object = IOOBJECT_TEMP_RELATION;
}
else
{
bufBlock = BufHdrGetBlock(bufHdr);
io_object = IOOBJECT_RELATION;
}
if (isExtend)
{
@@ -986,6 +1005,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
/* don't set checksum for all-zero page */
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
/*
* NB: we're *not* doing a ScheduleBufferTagForWriteback here;
* although we're essentially performing a write. At least on linux
@@ -1013,6 +1034,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
pgstat_count_io_op(io_object, io_context, IOOP_READ);
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
@@ -1106,14 +1129,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer.
*
* io_context is passed as an output parameter to avoid calling
* IOContextForStrategy() when there is a shared buffers hit and no IO
* statistics need be captured.
*
* No locks are held either at entry or exit.
*/
static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr)
bool *foundPtr, IOContext *io_context)
{
bool from_ring;
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLock *newPartitionLock; /* buffer partition lock for it */
@@ -1165,8 +1193,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
{
/*
* If we get here, previous attempts to read the buffer must
* have failed ... but we shall bravely try again.
* have failed ... but we shall bravely try again. Set
* io_context since we will in fact need to count an IO
* Operation.
*/
*io_context = IOContextForStrategy(strategy);
*foundPtr = false;
}
}
@@ -1180,6 +1211,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
LWLockRelease(newPartitionLock);
*io_context = IOContextForStrategy(strategy);
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -1193,7 +1226,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Select a victim buffer. The buffer is returned with its header
* spinlock still held!
*/
buf = StrategyGetBuffer(strategy, &buf_state);
buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);
Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
@@ -1247,7 +1280,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf, buf_state);
if (XLogNeedsFlush(lsn) &&
StrategyRejectBuffer(strategy, buf))
StrategyRejectBuffer(strategy, buf, from_ring))
{
/* Drop lock/pin and loop around for another buffer */
LWLockRelease(BufferDescriptorGetContentLock(buf));
@@ -1262,7 +1295,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
smgr->smgr_rlocator.locator.dbOid,
smgr->smgr_rlocator.locator.relNumber);
FlushBuffer(buf, NULL);
FlushBuffer(buf, NULL, IOOBJECT_RELATION, *io_context);
LWLockRelease(BufferDescriptorGetContentLock(buf));
ScheduleBufferTagForWriteback(&BackendWritebackContext,
@@ -1443,6 +1476,28 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
LWLockRelease(newPartitionLock);
if (oldFlags & BM_VALID)
{
/*
* When a BufferAccessStrategy is in use, blocks evicted from shared
* buffers are counted as IOOP_EVICT in the corresponding context
* (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
* strategy in two cases: 1) while initially claiming buffers for the
* strategy ring 2) to replace an existing strategy ring buffer
* because it is pinned or in use and cannot be reused.
*
* Blocks evicted from buffers already in the strategy ring are
* counted as IOOP_REUSE in the corresponding strategy context.
*
* At this point, we can accurately count evictions and reuses,
* because we have successfully claimed the valid buffer. Previously,
* we may have been forced to release the buffer due to concurrent
* pinners or erroring out.
*/
pgstat_count_io_op(IOOBJECT_RELATION, *io_context,
from_ring ? IOOP_REUSE : IOOP_EVICT);
}
/*
* Buffer contents are currently invalid. Try to obtain the right to
* start I/O. If StartBufferIO returns false, then someone else managed
@@ -2563,7 +2618,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
@@ -2813,7 +2868,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
* as the second parameter. If not, pass NULL.
*/
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln)
FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
IOContext io_context)
{
XLogRecPtr recptr;
ErrorContextCallback errcallback;
@@ -2907,6 +2963,26 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
bufToWrite,
false);
/*
* When a strategy is in use, only flushes of dirty buffers already in the
* strategy ring are counted as strategy writes (IOCONTEXT
* [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
* statistics tracking.
*
* If a shared buffer initially added to the ring must be flushed before
* being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
*
* If a shared buffer which was added to the ring later because the
* current strategy buffer is pinned or in use or because all strategy
* buffers were dirty and rejected (for BAS_BULKREAD operations only)
* requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
* (from_ring will be false).
*
* When a strategy is not in use, the write can only be a "regular" write
* of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
*/
pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
@@ -3549,6 +3625,8 @@ FlushRelationBuffers(Relation rel)
buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
/* Pop the error context stack */
error_context_stack = errcallback.previous;
}
@@ -3581,7 +3659,7 @@ FlushRelationBuffers(Relation rel)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, RelationGetSmgr(rel));
FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
@@ -3679,7 +3757,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, srelent->srel);
FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
@@ -3889,7 +3967,7 @@ FlushDatabaseBuffers(Oid dbid)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
@@ -3916,7 +3994,7 @@ FlushOneBuffer(Buffer buffer)
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
}
/*

src/backend/storage/buffer/freelist.c

@@ -15,6 +15,7 @@
*/
#include "postgres.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
@@ -81,12 +82,6 @@ typedef struct BufferAccessStrategyData
*/
int current;
/*
* True if the buffer just returned by StrategyGetBuffer had been in the
* ring already.
*/
bool current_was_in_ring;
/*
* Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
* have not yet selected a buffer for this ring slot. For allocation
@@ -198,13 +193,15 @@ have_free_buffer(void)
* return the buffer with the buffer header spinlock still held.
*/
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
{
BufferDesc *buf;
int bgwprocno;
int trycounter;
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
*from_ring = false;
/*
* If given a strategy object, see whether it can select a buffer. We
* assume strategy objects don't need buffer_strategy_lock.
@@ -213,7 +210,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
buf = GetBufferFromRing(strategy, buf_state);
if (buf != NULL)
{
*from_ring = true;
return buf;
}
}
/*
@@ -602,7 +602,7 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
/*
* GetBufferFromRing -- returns a buffer from the ring, or NULL if the
* ring is empty.
* ring is empty / not usable.
*
* The bufhdr spin lock is held on the returned buffer.
*/
@@ -625,10 +625,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
*/
bufnum = strategy->buffers[strategy->current];
if (bufnum == InvalidBuffer)
{
strategy->current_was_in_ring = false;
return NULL;
}
/*
* If the buffer is pinned we cannot use it under any circumstances.
@@ -644,7 +641,6 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
{
strategy->current_was_in_ring = true;
*buf_state = local_buf_state;
return buf;
}
@@ -654,7 +650,6 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
* Tell caller to allocate a new buffer with the normal allocation
* strategy. He'll then replace this ring element via AddBufferToRing.
*/
strategy->current_was_in_ring = false;
return NULL;
}
@@ -670,6 +665,39 @@ AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
/*
* Utility function returning the IOContext of a given BufferAccessStrategy's
* strategy ring.
*/
IOContext
IOContextForStrategy(BufferAccessStrategy strategy)
{
if (!strategy)
return IOCONTEXT_NORMAL;
switch (strategy->btype)
{
case BAS_NORMAL:
/*
* Currently, GetAccessStrategy() returns NULL for
* BufferAccessStrategyType BAS_NORMAL, so this case is
* unreachable.
*/
pg_unreachable();
return IOCONTEXT_NORMAL;
case BAS_BULKREAD:
return IOCONTEXT_BULKREAD;
case BAS_BULKWRITE:
return IOCONTEXT_BULKWRITE;
case BAS_VACUUM:
return IOCONTEXT_VACUUM;
}
elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
pg_unreachable();
}
/*
* StrategyRejectBuffer -- consider rejecting a dirty buffer
*
@@ -682,14 +710,14 @@ AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
* if this buffer should be written and re-used.
*/
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
/* We only do this in bulkread mode */
if (strategy->btype != BAS_BULKREAD)
return false;
/* Don't muck with behavior of normal buffer-replacement strategy */
if (!strategy->current_was_in_ring ||
if (!from_ring ||
strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
return false;
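
As a usage illustration of the new strategy-to-IOContext mapping and the
from_ring output parameter (a minimal sketch against backend internals;
GetAccessStrategy() is the pre-existing constructor for strategies):

    BufferAccessStrategy strategy = GetAccessStrategy(BAS_VACUUM);
    uint32 buf_state;
    bool from_ring;

    IOContext io_context = IOContextForStrategy(strategy); /* IOCONTEXT_VACUUM */
    IOContext no_strategy = IOContextForStrategy(NULL);    /* IOCONTEXT_NORMAL */

    /* The victim buffer comes back with its header spinlock held; from_ring
     * tells BufferAlloc() whether evicting it should be counted as
     * IOOP_REUSE (it was already in the ring) or IOOP_EVICT. */
    BufferDesc *buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);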

src/backend/storage/buffer/localbuf.c

@@ -18,6 +18,7 @@
#include "access/parallel.h"
#include "catalog/catalog.h"
#include "executor/instrument.h"
#include "pgstat.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "utils/guc_hooks.h"
@@ -107,7 +108,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
*/
BufferDesc *
LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
bool *foundPtr)
bool *foundPtr, IOContext *io_context)
{
BufferTag newTag; /* identity of requested block */
LocalBufferLookupEnt *hresult;
@@ -127,6 +128,14 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
hresult = (LocalBufferLookupEnt *)
hash_search(LocalBufHash, &newTag, HASH_FIND, NULL);
/*
* IO Operations on local buffers are only done in IOCONTEXT_NORMAL. Set
* io_context here (rather than only after ruling out a buffer hit) for
* convenience, since we don't have to worry about the overhead of calling
* IOContextForStrategy().
*/
*io_context = IOCONTEXT_NORMAL;
if (hresult)
{
b = hresult->id;
@@ -230,6 +239,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
buf_state &= ~BM_DIRTY;
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
pgBufferUsage.local_blks_written++;
}
@@ -255,6 +265,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
ClearBufferTag(&bufHdr->tag);
buf_state &= ~(BM_VALID | BM_TAG_VALID);
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT);
}
hresult = (LocalBufferLookupEnt *)

src/backend/storage/smgr/md.c

@@ -983,6 +983,15 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
/*
* fsyncs done through mdimmedsync() should be tracked in a separate
* IOContext from those done through mdsyncfiletag() to differentiate
* between unavoidable client backend fsyncs (e.g. those done during
* index build) and those which ideally would have been done by the
* checkpointer. Since other IO operations bypassing the buffer
* manager could also be tracked in such an IOContext, wait until
* these are also tracked to track immediate fsyncs.
*/
if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
@@ -1021,6 +1030,19 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
{
/*
* We have no way of knowing if the current IOContext is
* IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
* point, so count the fsync as being in the IOCONTEXT_NORMAL
* IOContext. This is probably okay, because the number of backend
* fsyncs doesn't say anything about the efficacy of the
* BufferAccessStrategy. And counting both fsyncs done in
* IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
* IOCONTEXT_NORMAL is likely clearer when investigating the number of
* backend fsyncs.
*/
pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
ereport(DEBUG1,
(errmsg_internal("could not forward fsync request because request queue is full")));
@@ -1410,6 +1432,8 @@ mdsyncfiletag(const FileTag *ftag, char *path)
if (need_to_close)
FileClose(file);
pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
errno = save_errno;
return result;
}
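
Condensing the fsync-attribution rules from the md.c comments above into one
place (a sketch; the helper and its parameter are hypothetical, purely to
summarize where this commit does and does not count):

    /* Hypothetical summary helper -- not part of the commit. */
    static void
    count_relation_fsync_sketch(bool via_sync_machinery)
    {
        if (via_sync_machinery)
            /* mdsyncfiletag(), and the backend fallback in
             * register_dirty_segment() when the request queue is full: */
            pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC);
        /* mdimmedsync() deliberately counts nothing for now; see its comment. */
    }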

src/include/storage/buf_internals.h

@@ -15,6 +15,7 @@
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
@@ -391,11 +392,12 @@ extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
/* freelist.c */
extern IOContext IOContextForStrategy(BufferAccessStrategy bas);
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
uint32 *buf_state);
uint32 *buf_state, bool *from_ring);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
BufferDesc *buf);
BufferDesc *buf, bool from_ring);
extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);
@@ -417,7 +419,7 @@ extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
ForkNumber forkNum,
BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
BlockNumber blockNum, bool *foundPtr);
BlockNumber blockNum, bool *foundPtr, IOContext *io_context);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelationLocalBuffers(RelFileLocator rlocator,
ForkNumber forkNum,

src/include/storage/bufmgr.h

@@ -23,7 +23,12 @@
typedef void *Block;
/* Possible arguments for GetAccessStrategy() */
/*
* Possible arguments for GetAccessStrategy().
*
* If adding a new BufferAccessStrategyType, also add a new IOContext so
* IO statistics using this strategy are tracked.
*/
typedef enum BufferAccessStrategyType
{
BAS_NORMAL, /* Normal random access */
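
For reference, the IOContext enum that the comment above asks future strategy
types to extend, as introduced by infrastructure commit 28e626bde0 (a sketch
reconstructed from the values used throughout this commit; consult pgstat.h
for the authoritative definition):

    typedef enum IOContext
    {
        IOCONTEXT_BULKREAD,
        IOCONTEXT_BULKWRITE,
        IOCONTEXT_NORMAL,
        IOCONTEXT_VACUUM,
    } IOContext;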