diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README index e7ff23b76f..dc2a63a137 100644 --- a/src/backend/storage/freespace/README +++ b/src/backend/storage/freespace/README @@ -169,9 +169,7 @@ Recovery -------- The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of -self-correcting measures to repair possible corruption. As a result when -we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint() -rather than MarkBufferDirty(). +self-correcting measures to repair possible corruption. First of all, whenever a value is set on an FSM page, the root node of the page is compared against the new value after bubbling up the change is @@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range] to propagate the new free-space info into the upper pages of the FSM tree. +As a result when we write to the FSM we treat that as a hint and thus use +MarkBufferDirtyHint() rather than MarkBufferDirty(). Every read here uses +RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification +failures. We'd operate correctly without the full page images that +MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot +knowledge to RBM_ZERO_ON_ERROR. + +Relation extension is not WAL-logged. Hence, after WAL replay, an on-disk FSM +slot may indicate free space in PageIsNew() blocks that never reached disk. +We detect this case by comparing against the actual relation size, and we mark +the block as full in that case. + TODO ---- diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 09d4b16067..5bc96954de 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -111,6 +111,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof); +static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber); /******** Public API ********/ @@ -127,6 +128,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, * amount of free space available on that page and then try again (see * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, * extend the relation. + * + * This can trigger FSM updates if any FSM entry is found to point to a block + * past the end of the relation. */ BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded) @@ -165,9 +169,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, * Otherwise, search as usual. */ if (search_slot != -1) - return fsm_get_heap_blk(addr, search_slot); - else - return fsm_search(rel, search_cat); + { + BlockNumber blknum = fsm_get_heap_blk(addr, search_slot); + + /* + * Check that the blknum is actually in the relation. Don't try to + * update the FSM in that case, just fall back to the other case + */ + if (fsm_does_block_exist(rel, blknum)) + return blknum; + } + return fsm_search(rel, search_cat); } /* @@ -295,14 +307,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks) fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); /* - * Truncation of a relation is WAL-logged at a higher-level, and we - * will be called at WAL replay. But if checksums are enabled, we need - * to still write a WAL record to protect against a torn page, if the - * page is flushed to disk before the truncation WAL record. We cannot - * use MarkBufferDirtyHint here, because that will not dirty the page - * during recovery. + * This change is non-critical, because fsm_does_block_exist() would + * stop us from returning a truncated-away block. However, since this + * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost + * of that many fsm_does_block_exist() rejections. Use a full + * MarkBufferDirty(), not MarkBufferDirtyHint(). */ MarkBufferDirty(buf); + + /* + * WAL-log like MarkBufferDirtyHint() might have done, just to avoid + * differing from the rest of the file in this respect. This is + * optional; see README mention of full page images. XXX consider + * XLogSaveBufferForHint() for even closer similarity. + * + * A higher-level operation calls us at WAL replay. If we crash + * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has + * not changed, and our fork remains valid. If we crash after that + * flush, redo will return here. + */ if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) log_newpage_buffer(buf, false); @@ -719,8 +742,15 @@ fsm_search(Relation rel, uint8 min_cat) (addr.level == FSM_BOTTOM_LEVEL), false); if (slot == -1) + { max_avail = fsm_get_max_avail(BufferGetPage(buf)); - UnlockReleaseBuffer(buf); + UnlockReleaseBuffer(buf); + } + else + { + /* Keep the pin for possible update below */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } } else slot = -1; @@ -732,8 +762,37 @@ fsm_search(Relation rel, uint8 min_cat) * bottom. */ if (addr.level == FSM_BOTTOM_LEVEL) - return fsm_get_heap_blk(addr, slot); + { + BlockNumber blkno = fsm_get_heap_blk(addr, slot); + Page page; + if (fsm_does_block_exist(rel, blkno)) + { + ReleaseBuffer(buf); + return blkno; + } + + /* + * Block is past the end of the relation. Update FSM, and + * restart from root. The usual "advancenext" behavior is + * pessimal for this rare scenario, since every later slot is + * unusable in the same way. We could zero all affected slots + * on the same FSM page, but don't bet on the benefits of that + * optimization justifying its compiled code bulk. + */ + page = BufferGetPage(buf); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + fsm_set_avail(page, slot, 0); + MarkBufferDirtyHint(buf, false); + UnlockReleaseBuffer(buf); + if (restarts++ > 10000) /* same rationale as below */ + return InvalidBlockNumber; + addr = FSM_ROOT_ADDRESS; + } + else + { + ReleaseBuffer(buf); + } addr = fsm_get_child(addr, slot); } else if (addr.level == FSM_ROOT_LEVEL) @@ -901,3 +960,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, return max_avail; } + + +/* + * Check whether a block number is past the end of the relation. This can + * happen after WAL replay, if the FSM reached disk but newly-extended pages + * it refers to did not. + */ +static bool +fsm_does_block_exist(Relation rel, BlockNumber blknumber) +{ + SMgrRelation smgr = RelationGetSmgr(rel); + + /* + * If below the cached nblocks, the block surely exists. Otherwise, we + * face a trade-off. We opt to compare to a fresh nblocks, incurring + * lseek() overhead. The alternative would be to assume the block does + * not exist, but that would cause FSM to set zero space available for + * blocks that main fork extension just recorded. + */ + return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) && + blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) || + blknumber < RelationGetNumberOfBlocks(rel)); +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 4dc24649df..1d3bb92dfe 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -572,8 +572,9 @@ BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) { /* - * For now, we only use cached values in recovery due to lack of a shared - * invalidation mechanism for changes in file size. + * For now, this function uses cached values only in recovery due to lack + * of a shared invalidation mechanism for changes in file size. Code + * elsewhere reads smgr_cached_nblocks and copes with stale data. */ if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber) return reln->smgr_cached_nblocks[forknum]; diff --git a/src/test/recovery/t/008_fsm_truncation.pl b/src/test/recovery/t/008_fsm_truncation.pl index 14b4b97e9e..0a81205b5b 100644 --- a/src/test/recovery/t/008_fsm_truncation.pl +++ b/src/test/recovery/t/008_fsm_truncation.pl @@ -1,9 +1,8 @@ # Copyright (c) 2021, PostgreSQL Global Development Group -# Test WAL replay of FSM changes. -# -# FSM changes don't normally need to be WAL-logged, except for truncation. +# Test FSM-driven INSERT just after truncation clears FSM slots indicating +# free space in removed blocks. # The FSM mustn't return a page that doesn't exist (anymore). use strict; use warnings;