Fix race condition between hot standby and restoring a full-page image.

There was a window in RestoreBackupBlock where a page would be zeroed out,
but not yet locked. If a backend pinned and locked the page in that window,
it saw the zeroed page instead of the old page or new page contents, which
could lead to missing rows in a result set, or errors.
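
For illustration, the pre-fix code in RestoreBackupBlockContents (removed by the
hunk further down) had the shape below; the gap between the read and the lock is
the window in question:

	/* Old, racy pattern (fragment of RestoreBackupBlockContents before this fix) */
	buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
									RBM_ZERO);
	Assert(BufferIsValid(buffer));
	/* <-- window: the buffer is already valid and zero-filled here, but not
	 * yet locked, so a backend running queries on the standby could pin,
	 * lock and read the zeroed page at this point */
	if (get_cleanup_lock)
		LockBufferForCleanup(buffer);
	else
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);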

To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins,
zeroes, and locks the page, if it's not in the buffer cache already.
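
As a caller-side sketch (same identifiers as in the RestoreBackupBlockContents
hunk below), the fix collapses the read and the lock into a single call, choosing
the cleanup-lock variant when a cleanup lock is wanted:

	/* Fixed pattern: pin, zero and lock happen atomically in one call */
	buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
					get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
	Assert(BufferIsValid(buffer));

	/* The buffer comes back exclusively locked (or cleanup-locked), so the
	 * caller can restore the full-page image before any other backend can
	 * see the zeroed page */
	page = (Page) BufferGetPage(buffer);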

In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE,
to avoid breaking any 3rd party extensions that might use RBM_ZERO. More
importantly, this avoids renumbering the other enum values, which would
cause even bigger confusion in extensions that use ReadBufferExtended, but
haven't been recompiled.
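
As an illustrative sketch only (not the exact text committed to the back
branches), the stable-branch ReadBufferMode keeps the old slot under the new
name and appends the new modes at the end, so existing values keep their
numbering:

	typedef enum
	{
		RBM_NORMAL,					/* Normal read */
		RBM_DO_NOT_USE,				/* formerly RBM_ZERO; slot kept so the
									 * values below keep their old numbers */
		RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
		RBM_NORMAL_NO_LOG,			/* Don't log page as invalid during WAL
									 * replay; otherwise same as RBM_NORMAL */
		RBM_ZERO_AND_LOCK,			/* Don't read from disk, caller will
									 * initialize. Also locks the page. */
		RBM_ZERO_AND_CLEANUP_LOCK	/* Like RBM_ZERO_AND_LOCK, but locks the page
									 * in "cleanup" mode */
	} ReadBufferMode;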

Backpatch to all supported versions; this has been racy since hot standby
was introduced.
Heikki Linnakangas 2014-11-13 19:47:44 +02:00
parent 35fed51626
commit 81c4508196
5 changed files with 78 additions and 32 deletions


@@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
 	if (blkno == P_NEW)
 		elog(ERROR, "hash AM does not use P_NEW");
 
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
-
-	LockBuffer(buf, HASH_WRITE);
+	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
+							 NULL);
 
 	/* ref count and lock type are correct */
@@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
 		if (BufferGetBlockNumber(buf) != blkno)
 			elog(ERROR, "unexpected hash relation size: %u, should be %u",
 				 BufferGetBlockNumber(buf), blkno);
+		LockBuffer(buf, HASH_WRITE);
 	}
 	else
-		buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
-
-	LockBuffer(buf, HASH_WRITE);
+	{
+		buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
+								 NULL);
+	}
 
 	/* ref count and lock type are correct */


@@ -7556,7 +7556,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
 	{
 		XLogReadBufferForRedoExtended(lsn, record, 0,
 									  target_node, MAIN_FORKNUM, blkno,
-									  RBM_ZERO, false, &buffer);
+									  RBM_ZERO_AND_LOCK, false, &buffer);
 		page = BufferGetPage(buffer);
 		PageInit(page, BufferGetPageSize(buffer), 0);
 		action = BLK_NEEDS_REDO;
@@ -7683,7 +7683,7 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record)
 	{
 		XLogReadBufferForRedoExtended(lsn, record, 0,
 									  rnode, MAIN_FORKNUM, blkno,
-									  RBM_ZERO, false, &buffer);
+									  RBM_ZERO_AND_LOCK, false, &buffer);
 		page = BufferGetPage(buffer);
 		PageInit(page, BufferGetPageSize(buffer), 0);
 		action = BLK_NEEDS_REDO;
@@ -7876,7 +7876,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	{
 		XLogReadBufferForRedoExtended(lsn, record, 1,
 									  rnode, MAIN_FORKNUM, newblk,
-									  RBM_ZERO, false, &nbuffer);
+									  RBM_ZERO_AND_LOCK, false, &nbuffer);
 		page = (Page) BufferGetPage(nbuffer);
 		PageInit(page, BufferGetPageSize(nbuffer), 0);
 		newaction = BLK_NEEDS_REDO;


@@ -287,9 +287,13 @@ XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index,
  * XLogReadBufferForRedoExtended
  *		Like XLogReadBufferForRedo, but with extra options.
  *
- * If mode is RBM_ZERO or RBM_ZERO_ON_ERROR, if the page doesn't exist, the
- * relation is extended with all-zeroes pages up to the referenced block
- * number. In RBM_ZERO mode, the return value is always BLK_NEEDS_REDO.
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the referenced block number. In
+ * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
+ * is always BLK_NEEDS_REDO.
+ *
+ * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
+ * parameter. Do not use an inconsistent combination!)
  *
  * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
  * using LockBufferForCleanup(), instead of a regular exclusive lock.
@@ -312,10 +316,13 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record,
 	*buf = XLogReadBufferExtended(rnode, forkno, blkno, mode);
 	if (BufferIsValid(*buf))
 	{
-		if (get_cleanup_lock)
-			LockBufferForCleanup(*buf);
-		else
-			LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+		if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
+		{
+			if (get_cleanup_lock)
+				LockBufferForCleanup(*buf);
+			else
+				LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+		}
 		if (lsn <= PageGetLSN(BufferGetPage(*buf)))
 			return BLK_DONE;
 		else
@@ -341,7 +348,8 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record,
  * The returned buffer is exclusively-locked.
  *
  * For historical reasons, instead of a ReadBufferMode argument, this only
- * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
+ * modes.
  */
 Buffer
 XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
@@ -349,8 +357,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
 	Buffer		buf;
 
 	buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
-								 init ? RBM_ZERO : RBM_NORMAL);
-	if (BufferIsValid(buf))
+								 init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
+	if (BufferIsValid(buf) && !init)
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
 	return buf;
@@ -369,8 +377,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  * dropped or truncated. If we don't see evidence of that later in the WAL
  * sequence, we'll complain at the end of WAL replay.)
  *
- * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
- * relation is extended with all-zeroes pages up to the given block number.
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
  *
 * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
 * exist, and we don't check for all-zeroes. Thus, no log entry is made
@@ -424,7 +432,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 		do
 		{
 			if (buffer != InvalidBuffer)
+			{
+				if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+					LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 				ReleaseBuffer(buffer);
+			}
 			buffer = ReadBufferWithoutRelcache(rnode, forknum,
 											   P_NEW, mode, NULL);
 		}
@@ -432,6 +444,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 		/* Handle the corner case that P_NEW returns non-consecutive pages */
 		if (BufferGetBlockNumber(buffer) != blkno)
 		{
+			if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			ReleaseBuffer(buffer);
 			buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
 											   mode, NULL);
@@ -537,12 +551,8 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
 	Page		page;
 
 	buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
-									RBM_ZERO);
+					get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
 	Assert(BufferIsValid(buffer));
-	if (get_cleanup_lock)
-		LockBufferForCleanup(buffer);
-	else
-		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
 	page = (Page) BufferGetPage(buffer);


@@ -499,14 +499,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * valid, the page is zeroed instead of throwing an error. This is intended
  * for non-critical data, where the caller is prepared to repair errors.
  *
- * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
- * with zeros instead of reading it from disk. Useful when the caller is
- * going to fill the page from scratch, since this saves I/O and avoids
- * unnecessary failure if the page-on-disk has corrupt page headers.
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk. Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
+ * unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
  * Caution: do not use this mode to read a page that is beyond the relation's
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
  *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -648,6 +653,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 										  isExtend,
 										  found);
 
+		/*
+		 * In RBM_ZERO_AND_LOCK mode the caller expects the page to
+		 * be locked on return.
+		 */
+		if (!isLocalBuf)
+		{
+			if (mode == RBM_ZERO_AND_LOCK)
+				LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+			else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+				LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+		}
+
 		return BufferDescriptorGetBuffer(bufHdr);
 	}
@@ -729,7 +746,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * Read in the page, unless the caller intends to overwrite it and
 	 * just wants us to allocate a buffer.
 	 */
-	if (mode == RBM_ZERO)
+	if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 		MemSet((char *) bufBlock, 0, BLCKSZ);
 	else
 	{
@@ -771,6 +788,22 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		}
 	}
 
+	/*
+	 * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+	 * the page as valid, to make sure that no other backend sees the zeroed
+	 * page before the caller has had a chance to initialize it.
+	 *
+	 * Since no-one else can be looking at the page contents yet, there is no
+	 * difference between an exclusive lock and a cleanup-strength lock.
+	 * (Note that we cannot use LockBuffer() or LockBufferForCleanup() here,
+	 * because they assert that the buffer is already valid.)
+	 */
+	if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
+		!isLocalBuf)
+	{
+		LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+	}
+
 	if (isLocalBuf)
 	{
 		/* Only need to adjust flags */


@@ -36,8 +36,10 @@ typedef enum BufferAccessStrategyType
 typedef enum
 {
 	RBM_NORMAL,					/* Normal read */
-	RBM_ZERO,					/* Don't read from disk, caller will
-								 * initialize */
+	RBM_ZERO_AND_LOCK,			/* Don't read from disk, caller will
+								 * initialize. Also locks the page. */
+	RBM_ZERO_AND_CLEANUP_LOCK,	/* Like RBM_ZERO_AND_LOCK, but locks the page
+								 * in "cleanup" mode */
 	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
 	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
 								 * replay; otherwise same as RBM_NORMAL */