/*-------------------------------------------------------------------------
*
* bufmgr.c
* buffer manager interface routines
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.186 2005/03/04 20:21:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* ReadBuffer() -- find or create a buffer holding the requested page,
* and pin it so that no one can destroy it while this process
* is using it.
*
* ReleaseBuffer() -- unpin the buffer
*
* WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
* but don't unpin. The disk IO is delayed until buffer
* replacement.
*
* WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
*
* BufferSync() -- flush all dirty buffers in the buffer pool.
*
* BgBufferSync() -- flush some dirty buffers in the buffer pool.
*
* InitBufferPool() -- Init the buffer module.
*
* See other files:
* freelist.c -- chooses victim for buffer replacement
* buf_table.c -- manages the buffer lookup table
*/
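/*
 * Illustrative call sequence (a sketch of assumed typical usage, not code
 * from this file): a caller that modifies a page first pins the buffer,
 * then takes the content lock, makes its change, and finally marks the
 * buffer dirty:
 *
 *		buf = ReadBuffer(rel, blkno);
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		... modify the page via BufferGetPage(buf) ...
 *		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		WriteBuffer(buf);		-- marks dirty and drops our pin
 */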
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/relcache.h"
#include "utils/resowner.h"
#include "pgstat.h"
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) BufferBlockPointers[(bufHdr)->buf_id]
#define BufferGetLSN(bufHdr) (*((XLogRecPtr*) BufHdrGetBlock(bufHdr)))
/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
/* GUC variables */
bool zero_damaged_pages = false;
double bgwriter_lru_percent = 1.0;
double bgwriter_all_percent = 0.333;
int bgwriter_lru_maxpages = 5;
int bgwriter_all_maxpages = 5;
long NDirectFileRead; /* some I/O's are direct file access.
* bypass bufmgr */
long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
/* local state for StartBufferIO and related functions */
static BufferDesc *InProgressBuf = NULL;
static bool IsForInput;
/* local state for LockBufferForCleanup */
static BufferDesc *PinCountWaitBuf = NULL;
static bool PinBuffer(BufferDesc *buf);
static void PinBuffer_Locked(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf, bool fixOwner, bool trashOK);
static bool SyncOneBuffer(int buf_id, bool skip_pinned);
static void WaitIO(BufferDesc *buf);
static bool StartBufferIO(BufferDesc *buf, bool forInput);
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
int set_flag_bits);
static void buffer_write_error_callback(void *arg);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void write_buffer(Buffer buffer, bool unpin);
/*
* ReadBuffer -- returns a buffer containing the requested
* block of the requested relation. If the blknum
* requested is P_NEW, extend the relation file and
* allocate a new block. (Caller is responsible for
* ensuring that only one backend tries to extend a
* relation at the same time!)
*
* Returns: the buffer number for the buffer containing
* the block read. The returned buffer has been pinned.
* Does not return on error --- elog's instead.
*
* Assumes that reln has already been opened
* when this function is called.
*/
Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
BufferDesc *bufHdr;
Block bufBlock;
bool found;
bool isExtend;
bool isLocalBuf;
/* Make sure we will have room to remember the buffer pin */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
isExtend = (blockNum == P_NEW);
isLocalBuf = reln->rd_istemp;
/* Open it at the smgr level if not already done */
RelationOpenSmgr(reln);
/* Substitute proper block number if caller asked for P_NEW */
if (isExtend)
blockNum = smgrnblocks(reln->rd_smgr);
pgstat_count_buffer_read(&reln->pgstat_info, reln);
if (isLocalBuf)
{
ReadLocalBufferCount++;
bufHdr = LocalBufferAlloc(reln, blockNum, &found);
if (found)
LocalBufferHitCount++;
}
else
{
ReadBufferCount++;
/*
* lookup the buffer. IO_IN_PROGRESS is set if the requested
* block is not currently in memory.
*/
bufHdr = BufferAlloc(reln, blockNum, &found);
if (found)
BufferHitCount++;
}
/* At this point we do NOT hold any locks. */
/* if it was already in the buffer pool, we're done */
if (found)
{
/* Just need to update stats before we exit */
pgstat_count_buffer_hit(&reln->pgstat_info, reln);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
return BufferDescriptorGetBuffer(bufHdr);
}
/*
* if we have gotten to this point, we have allocated a buffer for the
* page but its contents are not yet valid. IO_IN_PROGRESS is set for
* it, if it's a shared buffer.
*
* Note: if smgrextend fails, we will end up with a buffer that is
* allocated but not marked BM_VALID. P_NEW will still select the
* same block number (because the relation didn't get any longer on
* disk) and so future attempts to extend the relation will find the
* same buffer (if it's not been recycled) but come right back here to
* try smgrextend again.
*/
Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isExtend)
{
/* new buffers are zero-filled */
MemSet((char *) bufBlock, 0, BLCKSZ);
smgrextend(reln->rd_smgr, blockNum, (char *) bufBlock,
reln->rd_istemp);
}
else
{
smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
/* check for garbage data */
if (!PageHeaderIsValid((PageHeader) bufBlock))
{
/*
* During WAL recovery, the first access to any data page
* should overwrite the whole page from the WAL; so a
* clobbered page header is not a reason to fail. Hence, when
* InRecovery we may always act as though zero_damaged_pages
* is ON.
*/
if (zero_damaged_pages || InRecovery)
{
ereport(WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page",
blockNum, RelationGetRelationName(reln))));
MemSet((char *) bufBlock, 0, BLCKSZ);
}
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page header in block %u of relation \"%s\"",
blockNum, RelationGetRelationName(reln))));
}
}
if (isLocalBuf)
{
/* Only need to adjust flags */
bufHdr->flags |= BM_VALID;
}
else
{
/* Set BM_VALID, terminate IO, and wake up any waiters */
TerminateBufferIO(bufHdr, false, BM_VALID);
}
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
return BufferDescriptorGetBuffer(bufHdr);
}
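/*
 * Illustrative sketch only -- a hypothetical caller, not used anywhere in
 * the backend -- showing the ReadBuffer protocol for extending a relation
 * with P_NEW, per the comments above.  The caller is assumed to already
 * hold whatever lock prevents concurrent extension.  PageInit,
 * BufferGetPage, and BufferGetPageSize come from bufpage.h/bufmgr.h.
 */
#ifdef NOT_USED
static void
example_extend_relation(Relation rel)
{
	Buffer		buf;

	/* P_NEW appends a zero-filled block; the buffer comes back pinned */
	buf = ReadBuffer(rel, P_NEW);

	/* Lock the buffer before initializing the new page */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	/* WriteBuffer marks the page dirty and releases our pin */
	WriteBuffer(buf);
}
#endif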
/*
* BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
* buffer. If no buffer exists already, selects a replacement
* victim and evicts the old page, but does NOT read in new page.
*
* The returned buffer is pinned and is already marked as holding the
* desired page. If it already did have the desired page, *foundPtr is
* set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
* as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
*
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer.
*
* No locks are held either at entry or exit.
*/
static BufferDesc *
BufferAlloc(Relation reln,
BlockNumber blockNum,
bool *foundPtr)
{
BufferTag newTag; /* identity of requested block */
BufferTag oldTag;
BufFlags oldFlags;
int buf_id;
BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln, blockNum);
/* see if the block is in the buffer pool already */
LWLockAcquire(BufMappingLock, LW_SHARED);
buf_id = BufTableLookup(&newTag);
if (buf_id >= 0)
{
/*
* Found it. Now, pin the buffer so no one can steal it from the
* buffer pool, and check to see if the correct data has been
* loaded into the buffer.
*/
buf = &BufferDescriptors[buf_id];
valid = PinBuffer(buf);
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(BufMappingLock);
*foundPtr = TRUE;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading
* in the page, or (b) a previous read attempt failed. We
* have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again.
*/
*foundPtr = FALSE;
}
}
return buf;
}
/*
* Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. Remember to unlock BufMappingLock while doing the work.
*/
LWLockRelease(BufMappingLock);
/* Loop here in case we have to try another victim buffer */
for (;;)
{
/*
* Select a victim buffer. The buffer is returned with its
* header spinlock still held! Also the BufFreelistLock is
* still held, since it would be bad to hold the spinlock
* while possibly waking up other processes.
*/
buf = StrategyGetBuffer();
Assert(buf->refcount == 0);
/* Must copy buffer flags while we still hold the spinlock */
oldFlags = buf->flags;
/* Pin the buffer and then release the buffer spinlock */
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
* condition here, in that someone might dirty it after we released
* it above, or even while we are writing it out (since our share-lock
* won't prevent hint-bit updates). We will recheck the dirty bit
* after re-locking the buffer header.
*/
if (oldFlags & BM_DIRTY)
{
/*
* We need a share-lock on the buffer contents to write it out
* (else we might write invalid data, eg because someone else
* is compacting the page contents while we write). We must use
* a conditional lock acquisition here to avoid deadlock. Even
* though the buffer was not pinned (and therefore surely not
* locked) when StrategyGetBuffer returned it, someone else could
* have pinned and exclusive-locked it by the time we get here.
* If we try to get the lock unconditionally, we'd block waiting
* for them; if they later block waiting for us, deadlock ensues.
* (This has been observed to happen when two backends are both
* trying to split btree index pages, and the second one just
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
{
FlushBuffer(buf, NULL);
LWLockRelease(buf->content_lock);
}
else
{
/*
* Someone else has pinned the buffer, so give it up and
* loop back to get another one.
*/
UnpinBuffer(buf, true, false /* evidently recently used */ );
continue;
}
}
/*
* Acquire exclusive mapping lock in preparation for changing
* the buffer's association.
*/
LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);
/*
* Try to make a hashtable entry for the buffer under its new tag.
* This could fail because while we were writing someone else
* allocated another buffer for the same block we want to read in.
* Note that we have not yet removed the hashtable entry for the
* old tag.
*/
buf_id = BufTableInsert(&newTag, buf->buf_id);
if (buf_id >= 0)
{
/*
* Got a collision. Someone has already done what we were about
* to do. We'll just handle this as if it were found in
* the buffer pool in the first place. First, give up the
* buffer we were planning to use. Don't allow it to be
* thrown in the free list (we don't want to hold both
* global locks at once).
*/
UnpinBuffer(buf, true, false);
/* remaining code should match code at top of routine */
buf = &BufferDescriptors[buf_id];
valid = PinBuffer(buf);
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(BufMappingLock);
*foundPtr = TRUE;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading
* in the page, or (b) a previous read attempt failed. We
* have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again.
*/
*foundPtr = FALSE;
}
}
return buf;
}
/*
* Need to lock the buffer header too in order to change its tag.
*/
LockBufHdr_NoHoldoff(buf);
/*
* Somebody could have pinned or re-dirtied the buffer while we were
* doing the I/O and making the new hashtable entry. If so, we
* can't recycle this buffer; we must undo everything we've done and
* start over with a new victim buffer.
*/
if (buf->refcount == 1 && !(buf->flags & BM_DIRTY))
break;
UnlockBufHdr_NoHoldoff(buf);
BufTableDelete(&newTag);
LWLockRelease(BufMappingLock);
UnpinBuffer(buf, true, false /* evidently recently used */ );
}
/*
* Okay, it's finally safe to rename the buffer.
*
* Clearing BM_VALID here is necessary, clearing the dirtybits
* is just paranoia. We also clear the usage_count since any
* recency of use of the old content is no longer relevant.
*/
oldTag = buf->tag;
oldFlags = buf->flags;
buf->tag = newTag;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
buf->flags |= BM_TAG_VALID;
buf->usage_count = 0;
UnlockBufHdr_NoHoldoff(buf);
if (oldFlags & BM_TAG_VALID)
BufTableDelete(&oldTag);
LWLockRelease(BufMappingLock);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
* lock. If StartBufferIO returns false, then someone else managed
* to read it before we did, so there's nothing left for BufferAlloc()
* to do.
*/
if (StartBufferIO(buf, true))
*foundPtr = FALSE;
else
*foundPtr = TRUE;
return buf;
}
/*
* InvalidateBuffer -- mark a shared buffer invalid and return it to the
* freelist.
*
* The buffer header spinlock must be held at entry. We drop it before
* returning. (This is sane because the caller must have locked the
* buffer in order to be sure it should be dropped.)
*
* This is used only in contexts such as dropping a relation. We assume
* that no other backend could possibly be interested in using the page,
* so the only reason the buffer might be pinned is if someone else is
* trying to write it out. We have to let them finish before we can
* reclaim the buffer.
*
* The buffer could get reclaimed by someone else while we are waiting
* to acquire the necessary locks; if so, don't mess it up.
*/
static void
InvalidateBuffer(BufferDesc *buf)
{
BufferTag oldTag;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
UnlockBufHdr(buf);
retry:
/*
* Acquire exclusive mapping lock in preparation for changing
* the buffer's association.
*/
LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);
/* Re-lock the buffer header (NoHoldoff since we have an LWLock) */
LockBufHdr_NoHoldoff(buf);
/* If it's changed while we were waiting for lock, do nothing */
if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
{
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(BufMappingLock);
return;
}
/*
* We assume the only reason for it to be pinned is that someone else
* is flushing the page out. Wait for them to finish. (This could be
* an infinite loop if the refcount is messed up... it would be nice
* to time out after awhile, but there seems no way to be sure how
* many loops may be needed. Note that if the other guy has pinned
* the buffer but not yet done StartBufferIO, WaitIO will fall through
* and we'll effectively be busy-looping here.)
*/
if (buf->refcount != 0)
{
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(BufMappingLock);
WaitIO(buf);
goto retry;
}
/*
* Clear out the buffer's tag and flags. We must do this to ensure
* that linear scans of the buffer array don't think the buffer is valid.
*/
oldFlags = buf->flags;
CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
buf->usage_count = 0;
UnlockBufHdr_NoHoldoff(buf);
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
BufTableDelete(&oldTag);
/*
* Avoid accepting a cancel interrupt when we release the mapping lock;
* that would leave the buffer free but not on the freelist. (Which would
* not be fatal, since it'd get picked up again by the clock scanning
* code, but we'd rather be sure it gets to the freelist.)
*/
HOLD_INTERRUPTS();
LWLockRelease(BufMappingLock);
/*
* Insert the buffer at the head of the list of free buffers.
*/
StrategyFreeBuffer(buf, true);
RESUME_INTERRUPTS();
}
/*
* write_buffer -- common functionality for
* WriteBuffer and WriteNoReleaseBuffer
*/
static void
write_buffer(Buffer buffer, bool unpin)
{
BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer id: %d", buffer);
if (BufferIsLocal(buffer))
{
WriteLocalBuffer(buffer, unpin);
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
/*
* If the buffer was not dirty already, do vacuum cost accounting.
*/
if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr(bufHdr);
if (unpin)
UnpinBuffer(bufHdr, true, true);
}
/*
* WriteBuffer
*
* Marks buffer contents as dirty (actual write happens later).
*
* Assume that buffer is pinned. Assume that reln is valid.
*
* Side Effects:
* Pin count is decremented.
*/
void
WriteBuffer(Buffer buffer)
{
write_buffer(buffer, true);
}
/*
* WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
* when the operation is complete.
*/
void
WriteNoReleaseBuffer(Buffer buffer)
{
write_buffer(buffer, false);
}
/*
* ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
*
* Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
* compared to calling the two routines separately. Now it's mainly just
* a convenience function. However, if the passed buffer is valid and
* already contains the desired block, we just return it as-is; and that
* does save considerable work compared to a full release and reacquire.
*
* Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
* buffer actually needs to be released. This case is the same as ReadBuffer,
* but can save some tests in the caller.
*/
Buffer
ReleaseAndReadBuffer(Buffer buffer,
Relation relation,
BlockNumber blockNum)
{
BufferDesc *bufHdr;
if (BufferIsValid(buffer))
{
if (BufferIsLocal(buffer))
{
Assert(LocalRefCount[-buffer - 1] > 0);
bufHdr = &LocalBufferDescriptors[-buffer - 1];
if (bufHdr->tag.blockNum == blockNum &&
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
return buffer;
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
LocalRefCount[-buffer - 1]--;
if (LocalRefCount[-buffer - 1] == 0 &&
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
}
else
{
Assert(PrivateRefCount[buffer - 1] > 0);
bufHdr = &BufferDescriptors[buffer - 1];
/* we have pin, so it's ok to examine tag without spinlock */
if (bufHdr->tag.blockNum == blockNum &&
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node))
return buffer;
UnpinBuffer(bufHdr, true, true);
}
}
return ReadBuffer(relation, blockNum);
}
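/*
 * Illustrative sketch only (a hypothetical helper, not part of this file):
 * a block-at-a-time scan that uses ReleaseAndReadBuffer to hold at most
 * one pin while walking a relation.
 */
#ifdef NOT_USED
static void
example_scan_relation(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	BlockNumber blkno;
	Buffer		buf = InvalidBuffer;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* Drops the previous pin (if any), unless it's already the right block */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine BufferGetPage(buf) here ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}
#endif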
/*
* PinBuffer -- make buffer unavailable for replacement.
*
* This should be applied only to shared buffers, never local ones.
*
* Note that ResourceOwnerEnlargeBuffers must have been done already.
*
* Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
* some callers to avoid an extra spinlock cycle.
*/
static bool
PinBuffer(BufferDesc *buf)
{
int b = buf->buf_id;
bool result;
if (PrivateRefCount[b] == 0)
{
/*
* Use NoHoldoff here because we don't want the unlock to be a
* potential place to honor a QueryCancel request.
* (The caller should be holding off interrupts anyway.)
*/
LockBufHdr_NoHoldoff(buf);
buf->refcount++;
result = (buf->flags & BM_VALID) != 0;
UnlockBufHdr_NoHoldoff(buf);
}
else
{
/* If we previously pinned the buffer, it must surely be valid */
result = true;
}
PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0);
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
return result;
}
/*
* PinBuffer_Locked -- as above, but caller already locked the buffer header.
* The spinlock is released before return.
*
* Note: use of this routine is frequently mandatory, not just an optimization
* to save a spin lock/unlock cycle, because we need to pin a buffer before
* its state can change under us.
*/
static void
PinBuffer_Locked(BufferDesc *buf)
{
int b = buf->buf_id;
if (PrivateRefCount[b] == 0)
buf->refcount++;
/* NoHoldoff since we mustn't accept cancel interrupt here */
UnlockBufHdr_NoHoldoff(buf);
PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0);
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
/* Now we can accept cancel */
RESUME_INTERRUPTS();
}
/*
* UnpinBuffer -- make buffer available for replacement.
*
* This should be applied only to shared buffers, never local ones.
*
* Most but not all callers want CurrentResourceOwner to be adjusted.
*
* If we are releasing a buffer during VACUUM, and it's not been otherwise
* used recently, and trashOK is true, send the buffer to the freelist.
*/
static void
UnpinBuffer(BufferDesc *buf, bool fixOwner, bool trashOK)
{
int b = buf->buf_id;
if (fixOwner)
ResourceOwnerForgetBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
Assert(PrivateRefCount[b] > 0);
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0)
{
bool trash_buffer = false;
/* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
/* NoHoldoff ensures we don't lose control before sending signal */
LockBufHdr_NoHoldoff(buf);
/* Decrement the shared reference count */
Assert(buf->refcount > 0);
buf->refcount--;
/* Mark the buffer recently used, unless we are in VACUUM */
if (!strategy_hint_vacuum)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
else if (trashOK &&
buf->refcount == 0 &&
buf->usage_count == 0)
trash_buffer = true;
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
buf->refcount == 1)
{
/* we just released the last pin other than the waiter's */
BackendId wait_backend_id = buf->wait_backend_id;
buf->flags &= ~BM_PIN_COUNT_WAITER;
UnlockBufHdr_NoHoldoff(buf);
ProcSendSignal(wait_backend_id);
}
else
UnlockBufHdr_NoHoldoff(buf);
/*
* If VACUUM is releasing an otherwise-unused buffer, send it to
* the freelist for near-term reuse. We put it at the tail so that
* it won't be used before any invalid buffers that may exist.
*/
if (trash_buffer)
StrategyFreeBuffer(buf, false);
}
}
/*
* BufferSync -- Write out all dirty buffers in the pool.
*
* This is called at checkpoint time to write out all dirty shared buffers.
*/
void
BufferSync(void)
{
int buf_id;
int num_to_scan;
/*
* Find out where to start the circular scan.
*/
buf_id = StrategySyncStart();
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
* Loop over all buffers.
*/
num_to_scan = NBuffers;
while (num_to_scan-- > 0)
{
(void) SyncOneBuffer(buf_id, false);
if (++buf_id >= NBuffers)
buf_id = 0;
}
}
/*
* BgBufferSync -- Write out some dirty buffers in the pool.
*
* This is called periodically by the background writer process.
*/
void
BgBufferSync(void)
{
static int buf_id1 = 0;
int buf_id2;
int num_to_scan;
int num_written;
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
* To minimize work at checkpoint time, we want to try to keep all the
* buffers clean; this motivates a scan that proceeds sequentially through
* all buffers. But we are also charged with ensuring that buffers that
* will be recycled soon are clean when needed; these buffers are the
* ones just ahead of the StrategySyncStart point. We make a separate
* scan through those.
*/
/*
* This loop runs over all buffers, including pinned ones. The
* starting point advances through the buffer pool on successive calls.
*/
if (bgwriter_all_percent > 0.0 && bgwriter_all_maxpages > 0)
{
num_to_scan = (int) ((NBuffers * bgwriter_all_percent + 99) / 100);
num_written = 0;
while (num_to_scan-- > 0)
{
if (SyncOneBuffer(buf_id1, false))
num_written++;
if (++buf_id1 >= NBuffers)
buf_id1 = 0;
if (num_written >= bgwriter_all_maxpages)
break;
}
}
/*
* This loop considers only unpinned buffers close to the clock sweep
* point.
*/
if (bgwriter_lru_percent > 0.0 && bgwriter_lru_maxpages > 0)
{
num_to_scan = (int) ((NBuffers * bgwriter_lru_percent + 99) / 100);
num_written = 0;
buf_id2 = StrategySyncStart();
while (num_to_scan-- > 0)
{
if (SyncOneBuffer(buf_id2, true))
num_written++;
if (++buf_id2 >= NBuffers)
buf_id2 = 0;
if (num_written >= bgwriter_lru_maxpages)
break;
}
}
}
/*
* SyncOneBuffer -- process a single buffer during syncing.
*
* If skip_pinned is true, we don't write currently-pinned buffers, nor
* buffers marked recently used, as these are not replacement candidates.
*
* Returns true if buffer was written, else false. (This could be in error
* if FlushBuffer finds the buffer clean after locking it, but we don't
* care all that much.)
*
* Note: caller must have done ResourceOwnerEnlargeBuffers.
*/
static bool
SyncOneBuffer(int buf_id, bool skip_pinned)
{
BufferDesc *bufHdr = &BufferDescriptors[buf_id];
/*
* Check whether buffer needs writing.
*
* We can make this check without taking the buffer content lock
* so long as we mark pages dirty in access methods *before* logging
* changes with XLogInsert(): if someone marks the buffer dirty
* just after our check, we needn't worry, because our checkpoint.redo
* point precedes the log record for the upcoming changes, and so we are
* not required to write such a dirty buffer.
*/
LockBufHdr(bufHdr);
if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
{
UnlockBufHdr(bufHdr);
return false;
}
if (skip_pinned &&
(bufHdr->refcount != 0 || bufHdr->usage_count != 0))
{
UnlockBufHdr(bufHdr);
return false;
}
/*
* Pin it, share-lock it, write it. (FlushBuffer will do nothing
* if the buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true, false /* don't change freelist */ );
return true;
}
/*
* Return a palloc'd string containing buffer usage statistics.
*/
char *
ShowBufferUsage(void)
{
StringInfoData str;
float hitrate;
float localhitrate;
initStringInfo(&str);
if (ReadBufferCount == 0)
hitrate = 0.0;
else
hitrate = (float) BufferHitCount *100.0 / ReadBufferCount;
if (ReadLocalBufferCount == 0)
localhitrate = 0.0;
else
localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount;
appendStringInfo(&str,
"!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate);
appendStringInfo(&str,
"!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n",
ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate);
appendStringInfo(&str,
"!\tDirect blocks: %10ld read, %10ld written\n",
NDirectFileRead, NDirectFileWrite);
return str.data;
}
void
ResetBufferUsage(void)
{
BufferHitCount = 0;
ReadBufferCount = 0;
BufferFlushCount = 0;
LocalBufferHitCount = 0;
ReadLocalBufferCount = 0;
LocalBufferFlushCount = 0;
NDirectFileRead = 0;
NDirectFileWrite = 0;
}
/*
* AtEOXact_Buffers - clean up at end of transaction.
*
* As of PostgreSQL 8.0, buffer pins should get released by the
* ResourceOwner mechanism. This routine is just a debugging
* cross-check that no pins remain.
*/
void
AtEOXact_Buffers(bool isCommit)
{
#ifdef USE_ASSERT_CHECKING
int i;
for (i = 0; i < NBuffers; i++)
{
Assert(PrivateRefCount[i] == 0);
}
AtEOXact_LocalBuffers(isCommit);
#endif
/* Make sure we reset the strategy hint in case VACUUM errored out */
StrategyHintVacuum(false);
}
/*
* Ensure we have released all shared-buffer locks and pins during backend exit
*/
void
AtProcExit_Buffers(void)
{
int i;
AbortBufferIO();
UnlockBuffers();
for (i = 0; i < NBuffers; i++)
{
if (PrivateRefCount[i] != 0)
{
BufferDesc *buf = &(BufferDescriptors[i]);
/*
* We don't worry about updating ResourceOwner; if we even got
* here, it suggests that ResourceOwners are messed up.
*/
PrivateRefCount[i] = 1; /* make sure we release shared pin */
UnpinBuffer(buf, false, false /* don't change freelist */ );
Assert(PrivateRefCount[i] == 0);
}
}
}
/*
* Helper routine to issue warnings when a buffer is unexpectedly pinned
*/
void
PrintBufferLeakWarning(Buffer buffer)
{
BufferDesc *buf;
int32 loccount;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
{
buf = &LocalBufferDescriptors[-buffer - 1];
loccount = LocalRefCount[-buffer - 1];
}
else
{
buf = &BufferDescriptors[buffer - 1];
loccount = PrivateRefCount[buffer - 1];
}
/* theoretically we should lock the bufhdr here */
elog(WARNING,
"buffer refcount leak: [%03d] "
"(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
buffer,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags,
buf->refcount, loccount);
}
/*
* FlushBufferPool
*
* Flush all dirty blocks in buffer pool to disk at the checkpoint time.
* Local relations do not participate in checkpoints, so they don't need to be
* flushed.
*/
void
FlushBufferPool(void)
{
BufferSync();
smgrsync();
}
/*
* Do whatever is needed to prepare for commit at the bufmgr and smgr levels
*/
void
BufmgrCommit(void)
{
/* Nothing to do in bufmgr anymore... */
smgrcommit();
}
/*
* BufferGetBlockNumber
* Returns the block number associated with a buffer.
*
* Note:
* Assumes that the buffer is valid and pinned, else the
* value may be obsolete immediately...
*/
BlockNumber
BufferGetBlockNumber(Buffer buffer)
{
BufferDesc *bufHdr;
Assert(BufferIsPinned(buffer));
if (BufferIsLocal(buffer))
bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
else
bufHdr = &BufferDescriptors[buffer - 1];
/* pinned, so OK to read tag without spinlock */
return bufHdr->tag.blockNum;
}
/*
* BufferGetFileNode
* Returns the relation ID (RelFileNode) associated with a buffer.
*
* This should make the same checks as BufferGetBlockNumber, but since the
* two are generally called together, we don't bother.
*/
RelFileNode
BufferGetFileNode(Buffer buffer)
{
BufferDesc *bufHdr;
if (BufferIsLocal(buffer))
bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
else
bufHdr = &BufferDescriptors[buffer - 1];
return bufHdr->tag.rnode;
}
/*
* FlushBuffer
* Physically write out a shared buffer.
*
* NOTE: this actually just passes the buffer contents to the kernel; the
* real write to disk won't happen until the kernel feels like it. This
* is okay from our point of view since we can redo the changes from WAL.
* However, we will need to force the changes to disk via fsync before
* we can checkpoint WAL.
*
* The caller must hold a pin on the buffer and have share-locked the
* buffer contents. (Note: a share-lock does not prevent updates of
* hint bits in the buffer, so the page could change while the write
* is in progress, but we assume that that will not invalidate the data
* written.)
*
* If the caller has an smgr reference for the buffer's relation, pass it
* as the second parameter. If not, pass NULL.
*/
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln)
{
XLogRecPtr recptr;
ErrorContextCallback errcontext;
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so
* we need not do anything.
*/
if (!StartBufferIO(buf, false))
return;
/* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback;
errcontext.arg = buf;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
/* Find smgr relation for buffer */
if (reln == NULL)
reln = smgropen(buf->tag.rnode);
/*
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file
* changes they describe do.
*/
recptr = BufferGetLSN(buf);
XLogFlush(recptr);
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing
* because we have the io_in_progress lock.
*/
/* To check if block content changes while flushing. - vadim 01/17/97 */
LockBufHdr_NoHoldoff(buf);
buf->flags &= ~BM_JUST_DIRTIED;
UnlockBufHdr_NoHoldoff(buf);
smgrwrite(reln,
buf->tag.blockNum,
(char *) BufHdrGetBlock(buf),
false);
BufferFlushCount++;
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set)
* and end the io_in_progress state.
*/
TerminateBufferIO(buf, true, 0);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
}
/*
* RelationGetNumberOfBlocks
* Determines the current number of pages in the relation.
*/
BlockNumber
RelationGetNumberOfBlocks(Relation relation)
{
/* Open it at the smgr level if not already done */
RelationOpenSmgr(relation);
return smgrnblocks(relation->rd_smgr);
}
/*
* RelationTruncate
* Physically truncate a relation to the specified number of blocks.
*
* Caller should already have done something to flush any buffered pages
* that are to be dropped.
*/
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
/* Open it at the smgr level if not already done */
RelationOpenSmgr(rel);
/* Make sure rd_targblock isn't pointing somewhere past end */
rel->rd_targblock = InvalidBlockNumber;
/* Do the real work */
smgrtruncate(rel->rd_smgr, nblocks, rel->rd_istemp);
}
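/*
 * Illustrative sketch (assumed typical usage, not mandated by this file):
 * a caller truncating a relation to nblocks would first deal with the
 * buffered pages past the new end, e.g.
 *
 *		FlushRelationBuffers(rel, nblocks);
 *		RelationTruncate(rel, nblocks);
 */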
/* ---------------------------------------------------------------------
* DropRelationBuffers
*
* This function removes all the buffered pages for a relation
* from the buffer pool. Dirty pages are simply dropped, without
* bothering to write them out first. This is NOT rollback-able,
* and so should be used only with extreme caution!
*
* There is no particularly good reason why this doesn't have a
* firstDelBlock parameter, except that current callers don't need it.
*
* We assume that the caller holds an exclusive lock on the relation,
* which should assure that no new buffers will be acquired for the rel
* meanwhile.
* --------------------------------------------------------------------
*/
void
DropRelationBuffers(Relation rel)
{
DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
}
/* ---------------------------------------------------------------------
* DropRelFileNodeBuffers
*
* This is the same as DropRelationBuffers, except that the target
* relation is specified by RelFileNode and temp status, and one
* may specify the first block to drop.
*
* This is NOT rollback-able. One legitimate use is to clear the
* buffer cache of buffers for a relation that is being deleted
* during transaction abort.
* --------------------------------------------------------------------
*/
void
DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
BlockNumber firstDelBlock)
{
int i;
BufferDesc *bufHdr;
if (istemp)
{
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
{
if (LocalRefCount[i] != 0)
elog(ERROR, "block %u of %u/%u/%u is still referenced (local %u)",
bufHdr->tag.blockNum,
bufHdr->tag.rnode.spcNode,
bufHdr->tag.rnode.dbNode,
bufHdr->tag.rnode.relNode,
LocalRefCount[i]);
CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->flags = 0;
bufHdr->usage_count = 0;
}
}
return;
}
for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.blockNum >= firstDelBlock)
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
}
/* ---------------------------------------------------------------------
* DropBuffers
*
* This function removes all the buffers in the buffer cache for a
* particular database. Dirty pages are simply dropped, without
* bothering to write them out first. This is used when we destroy a
* database, to avoid trying to flush data to disk when the directory
* tree no longer exists. Implementation is pretty similar to
* DropRelationBuffers() which is for destroying just one relation.
* --------------------------------------------------------------------
*/
void
DropBuffers(Oid dbid)
{
int i;
BufferDesc *bufHdr;
/*
* We needn't consider local buffers, since by assumption the target
* database isn't our own.
*/
for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
LockBufHdr(bufHdr);
if (bufHdr->tag.rnode.dbNode == dbid)
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
}
/* -----------------------------------------------------------------
* PrintBufferDescs
*
* this function prints all the buffer descriptors, for debugging
* use only.
* -----------------------------------------------------------------
*/
#ifdef NOT_USED
void
PrintBufferDescs(void)
{
int i;
BufferDesc *buf = BufferDescriptors;
for (i = 0; i < NBuffers; ++i, ++buf)
{
/* theoretically we should lock the bufhdr here */
elog(LOG,
"[%02d] (freeNext=%d, rel=%u/%u/%u, "
"blockNum=%u, flags=0x%x, refcount=%u %d)",
i, buf->freeNext,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
}
#endif
#ifdef NOT_USED
void
PrintPinnedBufs(void)
{
int i;
BufferDesc *buf = BufferDescriptors;
for (i = 0; i < NBuffers; ++i, ++buf)
{
if (PrivateRefCount[i] > 0)
{
/* theoretically we should lock the bufhdr here */
elog(LOG,
"[%02d] (freeNext=%d, rel=%u/%u/%u, "
"blockNum=%u, flags=0x%x, refcount=%u %d)",
i, buf->freeNext,
buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
}
}
#endif
/* ---------------------------------------------------------------------
* FlushRelationBuffers
*
* This function writes all dirty pages of a relation out to disk.
* Furthermore, pages that have blocknumber >= firstDelBlock are
* actually removed from the buffer pool.
*
* This is called by DROP TABLE to clear buffers for the relation
* from the buffer pool. Note that we must write dirty buffers,
* rather than just dropping the changes, because our transaction
* might abort later on; we want to roll back safely in that case.
*
* This is also called by VACUUM before truncating the relation to the
* given number of blocks. It might seem unnecessary for VACUUM to
* write dirty pages before firstDelBlock, since VACUUM should already
* have committed its changes. However, it is possible for there still
* to be dirty pages: if some page had unwritten on-row tuple status
* updates from a prior transaction, and VACUUM had no additional
* changes to make to that page, then VACUUM won't have written it.
* This is harmless in most cases but will break pg_upgrade, which
* relies on VACUUM to ensure that *all* tuples have correct on-row
* status. So, we check and flush all dirty pages of the rel
* regardless of block number.
*
* In all cases, the caller should be holding AccessExclusiveLock on
* the target relation to ensure that no other backend is busy reading
* more blocks of the relation (or might do so before we commit).
* This should also ensure that no one is busy dirtying these blocks.
*
* Formerly, we considered it an error condition if we found dirty
* buffers here. However, since BufferSync no longer forces out all
* dirty buffers at every xact commit, it's possible for dirty buffers
* to still be present in the cache due to failure of an earlier
* transaction. So, must flush dirty buffers without complaint.
*
* XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching.
* --------------------------------------------------------------------
*/
void
FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
{
int i;
BufferDesc *bufHdr;
/* Open rel at the smgr level if not already done */
RelationOpenSmgr(rel);
if (rel->rd_istemp)
{
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{
if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
ErrorContextCallback errcontext;
/* Setup error traceback support for ereport() */
errcontext.callback = buffer_write_error_callback;
errcontext.arg = bufHdr;
errcontext.previous = error_context_stack;
error_context_stack = &errcontext;
smgrwrite(rel->rd_smgr,
bufHdr->tag.blockNum,
(char *) LocalBufHdrGetBlock(bufHdr),
true);
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
}
if (LocalRefCount[i] > 0)
elog(ERROR, "FlushRelationBuffers(\"%s\" (local), %u): block %u is referenced (%d)",
RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum, LocalRefCount[i]);
if (bufHdr->tag.blockNum >= firstDelBlock)
{
CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->flags = 0;
bufHdr->usage_count = 0;
}
}
}
return;
}
/* Make sure we can handle the pin inside the loop */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
recheck:
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
{
if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, rel->rd_smgr);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true, false /* no freelist change */ );
/*
* As soon as we unpin, it's possible for someone to take
* the buffer away from us; so loop back to re-lock and
* re-check if it still belongs to the target relation.
*/
goto recheck;
}
/*
* Even though it's not dirty, it could still be pinned because
* TerminateBufferIO and UnpinBuffer are separate actions. Hence,
* we can't error out on nonzero reference count here.
*/
if (bufHdr->tag.blockNum >= firstDelBlock)
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
else
UnlockBufHdr(bufHdr);
}
}
/*
* ReleaseBuffer -- remove the pin on a buffer without
* marking it dirty.
*/
void
ReleaseBuffer(Buffer buffer)
{
BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer id: %d", buffer);
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
if (BufferIsLocal(buffer))
{
Assert(LocalRefCount[-buffer - 1] > 0);
bufHdr = &LocalBufferDescriptors[-buffer - 1];
LocalRefCount[-buffer - 1]--;
if (LocalRefCount[-buffer - 1] == 0 &&
bufHdr->usage_count < BM_MAX_USAGE_COUNT)
bufHdr->usage_count++;
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
if (PrivateRefCount[buffer - 1] > 1)
PrivateRefCount[buffer - 1]--;
else
UnpinBuffer(bufHdr, false, true);
}
/*
* IncrBufferRefCount
* Increment the pin count on a buffer that we have *already* pinned
* at least once.
*
* This function cannot be used on a buffer we do not have pinned,
* because it doesn't change the shared buffer state.
*/
void
IncrBufferRefCount(Buffer buffer)
{
Assert(BufferIsPinned(buffer));
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
if (BufferIsLocal(buffer))
LocalRefCount[-buffer - 1]++;
else
PrivateRefCount[buffer - 1]++;
}
/*
* SetBufferCommitInfoNeedsSave
*
* Mark a buffer dirty when we have updated tuple commit-status bits in it.
*
* This is essentially the same as WriteNoReleaseBuffer. We preserve the
* distinction as a way of documenting that the caller has not made a critical
* data change --- the status-bit update could be redone by someone else just
* as easily. Therefore, no WAL log record need be generated, whereas calls
* to WriteNoReleaseBuffer really ought to be associated with a WAL-entry-
* creating action.
*
* This routine might get called many times on the same page, if we are making
* the first scan after commit of an xact that added/deleted many tuples.
* So, be as quick as we can if the buffer is already dirty. We do this by
* not acquiring spinlock if it looks like the status bits are already OK.
* (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
* we look, because the buffer content update is already done and will be
* reflected in the I/O.)
*/
void
SetBufferCommitInfoNeedsSave(Buffer buffer)
{
BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer id: %d", buffer);
if (BufferIsLocal(buffer))
{
WriteLocalBuffer(buffer, false);
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED))
{
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr(bufHdr);
}
}
/*
* Release buffer content locks for shared buffers.
*
* Used to clean up after errors.
*
* Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
* of releasing buffer content locks per se; the only thing we need to deal
* with here is clearing any PIN_COUNT request that was in progress.
*/
void
UnlockBuffers(void)
{
BufferDesc *buf = PinCountWaitBuf;
if (buf)
{
HOLD_INTERRUPTS(); /* don't want to die() partway through... */
LockBufHdr_NoHoldoff(buf);
/*
* Don't complain if flag bit not set; it could have been
* reset but we got a cancel/die interrupt before getting the
* signal.
*/
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
buf->wait_backend_id == MyBackendId)
buf->flags &= ~BM_PIN_COUNT_WAITER;
UnlockBufHdr_NoHoldoff(buf);
ProcCancelWaitForSignal();
PinCountWaitBuf = NULL;
RESUME_INTERRUPTS();
}
}
/*
* Acquire or release the content_lock for the buffer.
*/
void
LockBuffer(Buffer buffer, int mode)
{
BufferDesc *buf;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
return;
buf = &(BufferDescriptors[buffer - 1]);
if (mode == BUFFER_LOCK_UNLOCK)
LWLockRelease(buf->content_lock);
else if (mode == BUFFER_LOCK_SHARE)
LWLockAcquire(buf->content_lock, LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
{
LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
/*
* This is not the best place to mark the buffer dirty (e.g., indexes do
* not always change the buffer they lock in exclusive mode). But please
* remember that it's critical to set the dirty bit *before* logging
* changes with XLogInsert() - see comments in SyncOneBuffer().
*/
LockBufHdr_NoHoldoff(buf);
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr_NoHoldoff(buf);
}
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
/*
* Acquire the content_lock for the buffer, but only if we don't have to wait.
*
* This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
*/
bool
ConditionalLockBuffer(Buffer buffer)
{
BufferDesc *buf;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
return true; /* act as though we got it */
buf = &(BufferDescriptors[buffer - 1]);
if (LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE))
{
/*
* This is not the best place to mark the buffer dirty (e.g., indexes do
* not always change the buffer they lock in exclusive mode). But please
* remember that it's critical to set the dirty bit *before* logging
* changes with XLogInsert() - see comments in SyncOneBuffer().
*/
LockBufHdr_NoHoldoff(buf);
buf->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr_NoHoldoff(buf);
return true;
}
return false;
}
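/*
 * Illustrative sketch only (a hypothetical caller): the usual reason to use
 * ConditionalLockBuffer is that we already hold another buffer's content
 * lock, so blocking here could deadlock; on failure we back off and let the
 * caller retry from scratch.
 */
#ifdef NOT_USED
static bool
example_try_second_buffer(Buffer first_buf, Buffer second_buf)
{
	/* Caller already holds the content lock on first_buf */
	if (ConditionalLockBuffer(second_buf))
		return true;			/* got the exclusive lock; proceed */

	/* Would have had to wait; release the first lock and report failure */
	LockBuffer(first_buf, BUFFER_LOCK_UNLOCK);
	return false;
}
#endif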
/*
* LockBufferForCleanup - lock a buffer in preparation for deleting items
*
* Items may be deleted from a disk page only when the caller (a) holds an
* exclusive lock on the buffer and (b) has observed that no other backend
* holds a pin on the buffer. If there is a pin, then the other backend
* might have a pointer into the buffer (for example, a heapscan reference
* to an item --- see README for more details). It's OK if a pin is added
* after the cleanup starts, however; the newly-arrived backend will be
* unable to look at the page until we release the exclusive lock.
*
* To implement this protocol, a would-be deleter must pin the buffer and
* then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
* LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
* it has successfully observed pin count = 1.
*/
void
LockBufferForCleanup(Buffer buffer)
{
BufferDesc *bufHdr;
Assert(BufferIsValid(buffer));
Assert(PinCountWaitBuf == NULL);
if (BufferIsLocal(buffer))
{
/* There should be exactly one pin */
if (LocalRefCount[-buffer - 1] != 1)
elog(ERROR, "incorrect local pin count: %d",
LocalRefCount[-buffer - 1]);
/* Nobody else to wait for */
return;
}
/* There should be exactly one local pin */
if (PrivateRefCount[buffer - 1] != 1)
elog(ERROR, "incorrect local pin count: %d",
PrivateRefCount[buffer - 1]);
bufHdr = &BufferDescriptors[buffer - 1];
for (;;)
{
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBufHdr_NoHoldoff(bufHdr);
Assert(bufHdr->refcount > 0);
if (bufHdr->refcount == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
UnlockBufHdr_NoHoldoff(bufHdr);
return;
}
/* Failed, so mark myself as waiting for pincount 1 */
if (bufHdr->flags & BM_PIN_COUNT_WAITER)
{
UnlockBufHdr_NoHoldoff(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
elog(ERROR, "multiple backends attempting to wait for pincount 1");
}
bufHdr->wait_backend_id = MyBackendId;
bufHdr->flags |= BM_PIN_COUNT_WAITER;
PinCountWaitBuf = bufHdr;
UnlockBufHdr_NoHoldoff(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/* Wait to be signaled by UnpinBuffer() */
ProcWaitForSignal();
PinCountWaitBuf = NULL;
/* Loop back and try again */
}
}
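/*
 * Illustrative sketch only (a hypothetical VACUUM-style caller) of the
 * protocol described above: pin first, then wait until we hold the
 * exclusive lock with pin count 1 before deleting items.
 */
#ifdef NOT_USED
static void
example_cleanup_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	buf = ReadBuffer(rel, blkno);	/* acquires our pin */
	LockBufferForCleanup(buf);		/* returns with exclusive lock, pincount 1 */
	/* ... now safe to delete items from BufferGetPage(buf) ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	WriteBuffer(buf);				/* mark dirty and release our pin */
}
#endif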
/*
* Functions for buffer I/O handling
*
* Note: We assume that nested buffer I/O never occurs.
* i.e at most one io_in_progress lock is held per proc.
*
* Also note that these are used only for shared buffers, not local ones.
*/
/*
* WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
*/
static void
WaitIO(BufferDesc *buf)
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process doing
* I/O could release the io_in_progress_lock prematurely. See
* AbortBufferIO.
*/
for (;;)
{
BufFlags sv_flags;
/*
* It may not be necessary to acquire the spinlock to check the
* flag here, but since this test is essential for correctness,
* we'd better play it safe.
*/
LockBufHdr(buf);
sv_flags = buf->flags;
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
LWLockRelease(buf->io_in_progress_lock);
}
}
/*
* StartBufferIO: begin I/O on this buffer
* (Assumptions)
* My process is executing no IO
* The buffer is Pinned
*
* In some scenarios there are race conditions in which multiple backends
* could attempt the same I/O operation concurrently. If someone else
* has already started I/O on this buffer then we will block on the
* io_in_progress lock until he's done.
*
* Input operations are only attempted on buffers that are not BM_VALID,
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
* so we can always tell if the work is already done.
*
* Returns TRUE if we successfully marked the buffer as I/O busy,
* FALSE if someone else already did the work.
*/
static bool
StartBufferIO(BufferDesc *buf, bool forInput)
{
Assert(!InProgressBuf);
for (;;)
{
/*
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
/* NoHoldoff is OK since we now have an LWLock */
LockBufHdr_NoHoldoff(buf);
if (!(buf->flags & BM_IO_IN_PROGRESS))
break;
/*
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
* lock isn't held is if the process doing the I/O is recovering from
* an error (see AbortBufferIO). If that's the case, we must wait for
* him to get unwedged.
*/
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(buf->io_in_progress_lock);
WaitIO(buf);
}
/* Once we get here, there is definitely no I/O active on this buffer */
if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
{
/* someone else already did the I/O */
UnlockBufHdr_NoHoldoff(buf);
LWLockRelease(buf->io_in_progress_lock);
return false;
}
buf->flags |= BM_IO_IN_PROGRESS;
UnlockBufHdr_NoHoldoff(buf);
InProgressBuf = buf;
IsForInput = forInput;
return true;
}
/*
* TerminateBufferIO: release a buffer we were doing I/O on
* (Assumptions)
* My process is executing IO for the buffer
* BM_IO_IN_PROGRESS bit is set for the buffer
* We hold the buffer's io_in_progress lock
* The buffer is Pinned
*
* If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
* buffer's BM_DIRTY flag. This is appropriate when terminating a
* successful write. The check on BM_JUST_DIRTIED is necessary to avoid
* marking the buffer clean if it was re-dirtied while we were writing.
*
* set_flag_bits gets ORed into the buffer's flags. It must include
* BM_IO_ERROR in a failure case. For successful completion it could
* be 0, or BM_VALID if we just finished reading in the page.
*/
static void
TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
{
Assert(buf == InProgressBuf);
/* NoHoldoff is OK since we must have an LWLock */
LockBufHdr_NoHoldoff(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
buf->flags &= ~BM_DIRTY;
buf->flags |= set_flag_bits;
UnlockBufHdr_NoHoldoff(buf);
InProgressBuf = NULL;
LWLockRelease(buf->io_in_progress_lock);
}
/*
* AbortBufferIO: Clean up any active buffer I/O after an error.
*
* All LWLocks we might have held have been released,
* but we haven't yet released buffer pins, so the buffer is still pinned.
*
* If I/O was in progress, we always set BM_IO_ERROR, even though it's
* possible the error condition wasn't related to the I/O.
*/
void
AbortBufferIO(void)
{
BufferDesc *buf = InProgressBuf;
if (buf)
{
/*
* Since LWLockReleaseAll has already been called, we're not
* holding the buffer's io_in_progress_lock. We have to re-acquire
* it so that we can use TerminateBufferIO. Anyone who's executing
* WaitIO on the buffer will be in a busy spin until we succeed in
* doing this.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
/* NoHoldoff is OK since we now have an LWLock */
LockBufHdr_NoHoldoff(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
if (IsForInput)
{
Assert(!(buf->flags & BM_DIRTY));
/* We'd better not think buffer is valid yet */
Assert(!(buf->flags & BM_VALID));
UnlockBufHdr_NoHoldoff(buf);
}
else
{
BufFlags sv_flags;
sv_flags = buf->flags;
Assert(sv_flags & BM_DIRTY);
UnlockBufHdr_NoHoldoff(buf);
/* Issue notice if this is not the first failure... */
if (sv_flags & BM_IO_ERROR)
{
/* Buffer is pinned, so we can read tag without spinlock */
ereport(WARNING,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not write block %u of %u/%u/%u",
buf->tag.blockNum,
buf->tag.rnode.spcNode,
buf->tag.rnode.dbNode,
buf->tag.rnode.relNode),
errdetail("Multiple failures --- write error may be permanent.")));
}
}
TerminateBufferIO(buf, false, BM_IO_ERROR);
}
}
/*
* Error context callback for errors occurring during buffer writes.
*/
static void
buffer_write_error_callback(void *arg)
{
BufferDesc *bufHdr = (BufferDesc *) arg;
/* Buffer is pinned, so we can read the tag without locking the spinlock */
if (bufHdr != NULL)
errcontext("writing block %u of relation %u/%u/%u",
bufHdr->tag.blockNum,
bufHdr->tag.rnode.spcNode,
bufHdr->tag.rnode.dbNode,
bufHdr->tag.rnode.relNode);
}