diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 5839c168e6..d41d318eef 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -74,23 +74,31 @@ RelationPutHeapTuple(Relation relation,
 }
 
 /*
- * Read in a buffer, using bulk-insert strategy if bistate isn't NULL.
+ * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL.
  */
 static Buffer
 ReadBufferBI(Relation relation, BlockNumber targetBlock,
-             BulkInsertState bistate)
+             ReadBufferMode mode, BulkInsertState bistate)
 {
     Buffer      buffer;
 
     /* If not bulk-insert, exactly like ReadBuffer */
     if (!bistate)
-        return ReadBuffer(relation, targetBlock);
+        return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
+                                  mode, NULL);
 
     /* If we have the desired block already pinned, re-pin and return it */
     if (bistate->current_buf != InvalidBuffer)
     {
         if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
         {
+            /*
+             * Currently the LOCK variants are only used for extending the
+             * relation, which should never reach this branch.
+             */
+            Assert(mode != RBM_ZERO_AND_LOCK &&
+                   mode != RBM_ZERO_AND_CLEANUP_LOCK);
+
             IncrBufferRefCount(bistate->current_buf);
             return bistate->current_buf;
         }
@@ -101,7 +109,7 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
 
     /* Perform a read using the buffer strategy */
     buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
-                                RBM_NORMAL, bistate->strategy);
+                                mode, bistate->strategy);
 
     /* Save the selected block as target for future inserts */
     IncrBufferRefCount(buffer);
@@ -204,11 +212,10 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
         /*
          * Extend by one page. This should generally match the main-line
          * extension code in RelationGetBufferForTuple, except that we hold
-         * the relation extension lock throughout.
+         * the relation extension lock throughout, and we don't immediately
+         * initialize the page (see below).
          */
-        buffer = ReadBufferBI(relation, P_NEW, bistate);
-
-        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+        buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
         page = BufferGetPage(buffer);
 
         if (!PageIsNew(page))
@@ -216,18 +223,18 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
                  BufferGetBlockNumber(buffer),
                  RelationGetRelationName(relation));
 
-        PageInit(page, BufferGetPageSize(buffer), 0);
-
         /*
-         * We mark all the new buffers dirty, but do nothing to write them
-         * out; they'll probably get used soon, and even if they are not, a
-         * crash will leave an okay all-zeroes page on disk.
+         * Add the page to the FSM without initializing it. If we were to
+         * initialize here, the page would potentially get flushed out to disk
+         * before we add any useful content. There's no guarantee that that'd
+         * happen before a potential crash, so we need to deal with
+         * uninitialized pages anyway, thus avoiding the potential for
+         * unnecessary writes.
          */
-        MarkBufferDirty(buffer);
 
         /* we'll need this info below */
         blockNum = BufferGetBlockNumber(buffer);
-        freespace = PageGetHeapFreeSpace(page);
+        freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;
 
         UnlockReleaseBuffer(buffer);
 
@@ -412,7 +419,7 @@ loop:
     if (otherBuffer == InvalidBuffer)
     {
         /* easy case */
-        buffer = ReadBufferBI(relation, targetBlock, bistate);
+        buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
         if (PageIsAllVisible(BufferGetPage(buffer)))
             visibilitymap_pin(relation, targetBlock, vmbuffer);
         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -479,6 +486,19 @@ loop:
          * we're done.
          */
         page = BufferGetPage(buffer);
+
+        /*
+         * If necessary, initialize the page; it'll be used soon. We could
+         * avoid dirtying the buffer here, and rely on the caller to do so
+         * whenever it puts a tuple onto the page, but there seems not much
+         * benefit in doing so.
+         */
+        if (PageIsNew(page))
+        {
+            PageInit(page, BufferGetPageSize(buffer), 0);
+            MarkBufferDirty(buffer);
+        }
+
         pageFreeSpace = PageGetHeapFreeSpace(page);
         if (len + saveFreeSpace <= pageFreeSpace)
         {
@@ -571,28 +591,7 @@ loop:
      * it worth keeping an accurate file length in shared memory someplace,
      * rather than relying on the kernel to do it for us?
      */
-    buffer = ReadBufferBI(relation, P_NEW, bistate);
-
-    /*
-     * We can be certain that locking the otherBuffer first is OK, since it
-     * must have a lower page number.
-     */
-    if (otherBuffer != InvalidBuffer)
-        LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
-
-    /*
-     * Now acquire lock on the new page.
-     */
-    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
-    /*
-     * Release the file-extension lock; it's now OK for someone else to extend
-     * the relation some more. Note that we cannot release this lock before
-     * we have buffer lock on the new page, or we risk a race condition
-     * against vacuumlazy.c --- see comments therein.
-     */
-    if (needLock)
-        UnlockRelationForExtension(relation, ExclusiveLock);
+    buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
 
     /*
      * We need to initialize the empty new page. Double-check that it really
@@ -607,6 +606,52 @@ loop:
              RelationGetRelationName(relation));
 
     PageInit(page, BufferGetPageSize(buffer), 0);
+    MarkBufferDirty(buffer);
+
+    /*
+     * Release the file-extension lock; it's now OK for someone else to extend
+     * the relation some more.
+     */
+    if (needLock)
+        UnlockRelationForExtension(relation, ExclusiveLock);
+
+    /*
+     * Lock the other buffer. It's guaranteed to be of a lower page number
+     * than the new page. To conform with the deadlock prevention rules, we
+     * ought to lock otherBuffer first, but that would give other backends a
+     * chance to put tuples on our page. To reduce the likelihood of that,
+     * attempt to lock the other buffer conditionally; that's very likely to
+     * work. Otherwise we need to lock buffers in the correct order, and
+     * retry if the space has been used in the meantime.
+     *
+     * Alternatively, we could acquire the lock on otherBuffer before
+     * extending the relation, but that'd require holding the lock while
+     * performing IO, which seems worse than an unlikely retry.
+     */
+    if (otherBuffer != InvalidBuffer)
+    {
+        Assert(otherBuffer != buffer);
+
+        if (unlikely(!ConditionalLockBuffer(otherBuffer)))
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+            /*
+             * Because the buffer was unlocked for a while, it's possible,
+             * although unlikely, that the page was filled. If so, just retry
+             * from the start.
+             */
+            if (len > PageGetHeapFreeSpace(page))
+            {
+                LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+                UnlockReleaseBuffer(buffer);
+
+                goto loop;
+            }
+        }
+    }
 
     if (len > PageGetHeapFreeSpace(page))
     {
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 37aa484ec3..26dfb0c7e0 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -860,43 +860,46 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
 
         if (PageIsNew(page))
         {
+            bool        still_new;
+
             /*
-             * An all-zeroes page could be left over if a backend extends the
-             * relation but crashes before initializing the page. Reclaim such
-             * pages for use.
+             * All-zeroes pages can be left over if a backend extends the
+             * relation by a single page, but crashes before the newly
+             * initialized page has been written out, or when bulk-extending
+             * the relation (which creates a number of empty pages at the
+             * tail end of the relation and enters them into the FSM).
              *
-             * We have to be careful here because we could be looking at a
-             * page that someone has just added to the relation and not yet
-             * been able to initialize (see RelationGetBufferForTuple). To
-             * protect against that, release the buffer lock, grab the
-             * relation extension lock momentarily, and re-lock the buffer. If
-             * the page is still uninitialized by then, it must be left over
-             * from a crashed backend, and we can initialize it.
+             * Make sure these pages are in the FSM, to ensure they can be
+             * reused. Do that by testing if there's any space recorded for
+             * the page. If not, enter it.
              *
-             * We don't really need the relation lock when this is a new or
-             * temp relation, but it's probably not worth the code space to
-             * check that, since this surely isn't a critical path.
-             *
-             * Note: the comparable code in vacuum.c need not worry because
-             * it's got exclusive lock on the whole relation.
+             * Note we do not enter the page into the visibilitymap. That has
+             * the downside that we repeatedly visit this page in subsequent
+             * vacuums, but it ensures we never fail to discover the space on
+             * a promoted standby. The harm of repeated checking normally
+             * ought not to be too bad - the space usually should be used at
+             * some point, otherwise there wouldn't be any regular vacuums.
              */
-            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-            LockRelationForExtension(onerel, ExclusiveLock);
-            UnlockRelationForExtension(onerel, ExclusiveLock);
-            LockBufferForCleanup(buf);
-            if (PageIsNew(page))
-            {
-                ereport(WARNING,
-                        (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
-                                relname, blkno)));
-                PageInit(page, BufferGetPageSize(buf), 0);
-                empty_pages++;
-            }
-            freespace = PageGetHeapFreeSpace(page);
-            MarkBufferDirty(buf);
+
+            /*
+             * Check the FSM only after releasing the lock; the FSM is
+             * approximate, after all.
+             */
+            still_new = PageIsNew(page);
             UnlockReleaseBuffer(buf);
 
-            RecordPageWithFreeSpace(onerel, blkno, freespace);
+            if (still_new)
+            {
+                empty_pages++;
+
+                if (GetRecordedFreeSpace(onerel, blkno) == 0)
+                {
+                    Size        freespace;
+
+                    freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
+                    RecordPageWithFreeSpace(onerel, blkno, freespace);
+                }
+            }
             continue;
         }
 
@@ -905,7 +908,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
             empty_pages++;
             freespace = PageGetHeapFreeSpace(page);
 
-            /* empty pages are always all-visible and all-frozen */
+            /*
+             * Empty pages are always all-visible and all-frozen (note that
+             * the same is currently not true for new pages, see above).
+             */
             if (!PageIsAllVisible(page))
             {
                 START_CRIT_SECTION();
@@ -1639,12 +1645,13 @@ lazy_check_needs_freeze(Buffer buf, bool *hastup)
 
     *hastup = false;
 
-    /* If we hit an uninitialized page, we want to force vacuuming it. */
-    if (PageIsNew(page))
-        return true;
-
-    /* Quick out for ordinary empty page. */
-    if (PageIsEmpty(page))
+    /*
+     * New and empty pages, obviously, don't contain tuples. We could make
+     * sure that the page is registered in the FSM, but it doesn't seem worth
+     * waiting for a cleanup lock just for that, especially because it's
+     * likely that the pin holder will do so.
+     */
+    if (PageIsNew(page) || PageIsEmpty(page))
         return false;
 
     maxoff = PageGetMaxOffsetNumber(page);
@@ -2029,7 +2036,6 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
 
         if (PageIsNew(page) || PageIsEmpty(page))
         {
-            /* PageIsNew probably shouldn't happen... */
             UnlockReleaseBuffer(buf);
             continue;
         }
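
Note on the free-space arithmetic the patch uses: for pages that are deliberately left uninitialized, it records BufferGetPageSize(buffer) - SizeOfPageHeaderData in the FSM, which is a slight overestimate of what PageGetHeapFreeSpace() would report once PageInit() has run (PageGetFreeSpace() additionally reserves room for one line pointer). The standalone sketch below just spells out that arithmetic; it assumes the default 8 kB block size, a 24-byte page header, and 4-byte line pointers, with the constants hard-coded rather than taken from the PostgreSQL headers.

#include <stdio.h>

/* Assumed constants for a default build; not taken from PostgreSQL headers. */
#define BLOCK_SIZE          8192    /* default BLCKSZ */
#define PAGE_HEADER_SIZE      24    /* SizeOfPageHeaderData on typical builds */
#define LINE_POINTER_SIZE      4    /* sizeof(ItemIdData) */

int
main(void)
{
    /* What the patch records in the FSM for a page it never initialized. */
    int     fsm_estimate = BLOCK_SIZE - PAGE_HEADER_SIZE;

    /*
     * What PageGetHeapFreeSpace() reports right after PageInit(): the page
     * header is gone from pd_upper - pd_lower, and PageGetFreeSpace()
     * additionally reserves room for one line pointer.
     */
    int     after_init = BLOCK_SIZE - PAGE_HEADER_SIZE - LINE_POINTER_SIZE;

    printf("FSM estimate for an uninitialized page: %d bytes\n", fsm_estimate);
    printf("free space right after PageInit():      %d bytes\n", after_init);

    return 0;
}

With the assumed constants this prints 8168 and 8164 bytes respectively, so the FSM entry is a harmless overestimate that gets corrected the first time the page is actually used.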
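
The retry dance around ConditionalLockBuffer() in RelationGetBufferForTuple can also be summarized independently of the buffer manager. The toy model below uses plain pthread mutexes rather than PostgreSQL buffer locks, and all names (other_page, new_page, page_still_usable, lock_other_page) are made up for illustration: we already hold the lock on the newly extended, higher-numbered page, try-lock the lower-numbered page to keep the fast path cheap, and only if that fails fall back to the canonical lock order and re-check whether the page still has room.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for two buffers; canonical order is other_page first. */
static pthread_mutex_t other_page = PTHREAD_MUTEX_INITIALIZER; /* lower blkno */
static pthread_mutex_t new_page = PTHREAD_MUTEX_INITIALIZER;   /* higher blkno */

/* Stand-in for "does the new page still have enough free space?". */
static bool
page_still_usable(void)
{
    return true;
}

/*
 * Called while already holding new_page. Returns true with both pages
 * locked, or false with both pages unlocked (the caller would retry).
 */
static bool
lock_other_page(void)
{
    /* Fast path: the other page wasn't locked, so we never let go of ours. */
    if (pthread_mutex_trylock(&other_page) == 0)
        return true;

    /* Slow path: release ours, then take both in the canonical order. */
    pthread_mutex_unlock(&new_page);
    pthread_mutex_lock(&other_page);
    pthread_mutex_lock(&new_page);

    /* Our page was unlocked for a while; re-check before using it. */
    if (page_still_usable())
        return true;

    pthread_mutex_unlock(&other_page);
    pthread_mutex_unlock(&new_page);
    return false;
}

int
main(void)
{
    pthread_mutex_lock(&new_page);  /* we start out holding the new page */

    if (lock_other_page())
        printf("both pages locked, page still usable\n");
    else
        printf("lost the race, caller retries from the top\n");

    return 0;
}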