diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 3eee86afe5..2fa0b065a2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4585,3 +4585,100 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+
+/*
+ * CheckBuffer
+ *
+ * Check the state of a buffer without loading it into the shared buffers. To
+ * avoid torn pages and possible false positives when reading data, a shared
+ * LWLock is taken on the target buffer pool partition mapping, and we check
+ * if the page is in shared buffers or not.  If it is, the I/O lock of that
+ * buffer is also taken in shared mode while the block is read from disk, to
+ * prevent concurrent I/O on it from happening.
+ *
+ * If the page is found as dirty in the shared buffers, it is ignored as
+ * it will be flushed to disk either before the end of the next checkpoint
+ * or during recovery in the event of an unsafe shutdown.  A buffer whose
+ * tag is not marked valid is ignored as well.
+ *
+ * If the page is found in the shared buffers but is not dirty, we still
+ * check the state of its data on disk, as it could be possible that the
+ * page stayed in shared buffers for a rather long time while the on-disk
+ * data got corrupted.
+ *
+ * If the page is not found in shared buffers, the block is read from disk
+ * while holding the buffer pool partition mapping LWLock.
+ *
+ * The page data is stored in a private, suitably-aligned memory area local
+ * to this function while running the checks.
+ *
+ * Returns true if the page was skipped, and otherwise the result of
+ * PageIsVerifiedExtended() on the data read from disk.
+ */
+bool
+CheckBuffer(SMgrRelation smgr, ForkNumber forknum, BlockNumber blkno)
+{
+	PGAlignedBlock buffer;		/* aligned, as it is examined as a page */
+	BufferTag	buf_tag;		/* identity of requested block */
+	uint32		buf_hash;		/* hash value for buf_tag */
+	LWLock	   *partLock;		/* buffer partition lock for the buffer */
+	BufferDesc *bufdesc;
+	int			buf_id;
+
+	Assert(smgrexists(smgr, forknum));
+
+	/* create a tag so we can look up the buffer */
+	INIT_BUFFERTAG(buf_tag, smgr->smgr_rnode.node, forknum, blkno);
+
+	/* determine its hash code and partition lock ID */
+	buf_hash = BufTableHashCode(&buf_tag);
+	partLock = BufMappingPartitionLock(buf_hash);
+
+	/* see if the block is in the buffer pool or not */
+	LWLockAcquire(partLock, LW_SHARED);
+	buf_id = BufTableLookup(&buf_tag, buf_hash);
+	if (buf_id >= 0)
+	{
+		uint32		buf_state;
+
+		/*
+		 * Found it.  Now, retrieve its state to know what to do with it.  The
+		 * buffer header lock is released immediately after reading the
+		 * state, to limit overhead as much as possible; no pin is taken.  We
+		 * keep the shared LWLock on the target buffer mapping partition for
+		 * now, so this buffer cannot be evicted, and we acquire an I/O lock
+		 * on the buffer as we may need to read its contents from disk.
+		 */
+		bufdesc = GetBufferDescriptor(buf_id);
+
+		LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+		buf_state = LockBufHdr(bufdesc);
+		UnlockBufHdr(bufdesc, buf_state);
+
+		/* If the page is dirty or invalid, skip it */
+		if ((buf_state & BM_DIRTY) != 0 || (buf_state & BM_TAG_VALID) == 0)
+		{
+			LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+			LWLockRelease(partLock);
+			return true;
+		}
+
+		/* Read the buffer from disk, with the I/O lock still held */
+		smgrread(smgr, forknum, blkno, buffer.data);
+		LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+	}
+	else
+	{
+		/*
+		 * Simply read the buffer.  There's no risk of modification on it as
+		 * we are holding the buffer pool partition mapping lock.
+		 */
+		smgrread(smgr, forknum, blkno, buffer.data);
+	}
+
+	/* buffer lookup done, so now do its check */
+	LWLockRelease(partLock);
+
+	return PageIsVerifiedExtended(buffer.data, blkno, PIV_REPORT_STAT);
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..a21cab2eaf 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -240,6 +240,9 @@ extern void AtProcExit_LocalBuffers(void);
 
 extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 
+extern bool CheckBuffer(struct SMgrRelationData *smgr, ForkNumber forknum,
+						BlockNumber blkno);
+
 /* in freelist.c */
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);