hio: Don't pin the VM while holding buffer lock while extending

Starting with commit 7db0cd2145, RelationGetBufferForTuple() did a
visibilitymap_pin() while holding an exclusive buffer content lock on a newly
extended page, when using COPY FREEZE. We elsewhere try hard to avoid doing
IO while holding a content lock. And until 14f98e0af9, that happened while
holding the relation extension lock.
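
Before this commit, the sequence in question looked like this (these are the
lines the patch removes from RelationGetBufferForTuple()):

	/* the newly extended "buffer" is exclusively content-locked here */
	if (options & HEAP_INSERT_FROZEN)
	{
		Assert(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0);
		/* may need to read a VM page from disk: IO under a content lock */
		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
	}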

Practically, this isn't a huge issue, because COPY FREEZE is restricted to
relations created or truncated in the current session, so it's unlikely
there's a lot of contention.

We can't avoid doing IO while holding the content lock by pinning the VM
earlier, because we don't know which page it will be on.

While we could just ignore the issue in this case, a future commit will add
bulk relation extension, which needs to enter pages into the FSM while also
trying to hold onto a buffer lock.

To address this issue, use visibilitymap_pin_ok() to see if the relevant
buffer is already pinned. If not, release the content lock on the buffer, pin
the VM buffer, and acquire the content lock again. This opens up a small
window for other backends to insert data onto the page - as the page has not
been entered into the free space map (FSM), other backends won't normally see
it, but a concurrent vacuum could enter the page into the FSM if it started
just after the relation was extended. In case the page is used by another
backend, retry. This is very similar to how locking "otherBuffer" is already
dealt with.

Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com>
Discussion: http://postgr.es/m/20230325025740.wzvchp2kromw4zqz@awork3.anarazel.de
Author: Andres Freund, 2023-04-06 11:11:13 -07:00
Commit: 18103b7c5f (parent: bba9003b62)
1 changed file with 85 additions and 41 deletions

--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -134,14 +134,17 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
  * To avoid complexity in the callers, either buffer1 or buffer2 may be
  * InvalidBuffer if only one buffer is involved. For the same reason, block2
  * may be smaller than block1.
+ *
+ * Returns whether buffer locks were temporarily released.
  */
-static void
+static bool
 GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 					 BlockNumber block1, BlockNumber block2,
 					 Buffer *vmbuffer1, Buffer *vmbuffer2)
 {
 	bool		need_to_pin_buffer1;
 	bool		need_to_pin_buffer2;
+	bool		released_locks = false;
 
 	/*
 	 * Swap buffers around to handle case of a single block/buffer, and to
@@ -175,9 +178,10 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 			&& PageIsAllVisible(BufferGetPage(buffer2))
 			&& !visibilitymap_pin_ok(block2, *vmbuffer2);
 		if (!need_to_pin_buffer1 && !need_to_pin_buffer2)
-			return;
+			break;
 
 		/* We must unlock both buffers before doing any I/O. */
+		released_locks = true;
 		LockBuffer(buffer1, BUFFER_LOCK_UNLOCK);
 		if (buffer2 != InvalidBuffer && buffer2 != buffer1)
 			LockBuffer(buffer2, BUFFER_LOCK_UNLOCK);
@@ -203,6 +207,8 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
 			|| (need_to_pin_buffer1 && need_to_pin_buffer2))
 			break;
 	}
+
+	return released_locks;
 }
 
 /*
@@ -365,6 +371,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	BlockNumber targetBlock,
 				otherBlock;
 	bool		needLock;
+	bool		unlockedTargetBuffer;
+	bool		recheckVmPins;
 
 	len = MAXALIGN(len);		/* be conservative */
@@ -645,6 +653,9 @@ loop:
 	if (needLock)
 		UnlockRelationForExtension(relation, ExclusiveLock);
 
+	unlockedTargetBuffer = false;
+	targetBlock = BufferGetBlockNumber(buffer);
+
 	/*
 	 * We need to initialize the empty new page. Double-check that it really
 	 * is empty (this should never happen, but if it does we don't want to
@@ -654,75 +665,108 @@ loop:
 	if (!PageIsNew(page))
 		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-			 BufferGetBlockNumber(buffer),
+			 targetBlock,
 			 RelationGetRelationName(relation));
 
 	PageInit(page, BufferGetPageSize(buffer), 0);
 	MarkBufferDirty(buffer);
 
 	/*
-	 * The page is empty, pin vmbuffer to set all_frozen bit.
+	 * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to
+	 * do IO while the buffer is locked, so we unlock the page first if IO is
+	 * needed (necessitating checks below).
 	 */
 	if (options & HEAP_INSERT_FROZEN)
 	{
-		Assert(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0);
-		visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer);
+		Assert(PageGetMaxOffsetNumber(page) == 0);
+
+		if (!visibilitymap_pin_ok(targetBlock, *vmbuffer))
+		{
+			unlockedTargetBuffer = true;
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+			visibilitymap_pin(relation, targetBlock, vmbuffer);
+		}
 	}
 
 	/*
-	 * Lock the other buffer. It's guaranteed to be of a lower page number
-	 * than the new page. To conform with the deadlock prevent rules, we ought
-	 * to lock otherBuffer first, but that would give other backends a chance
-	 * to put tuples on our page. To reduce the likelihood of that, attempt to
-	 * lock the other buffer conditionally, that's very likely to work.
-	 * Otherwise we need to lock buffers in the correct order, and retry if
-	 * the space has been used in the mean time.
+	 * Reacquire locks if necessary.
 	 *
-	 * Alternatively, we could acquire the lock on otherBuffer before
-	 * extending the relation, but that'd require holding the lock while
-	 * performing IO, which seems worse than an unlikely retry.
+	 * If the target buffer was unlocked above, or is unlocked while
+	 * reacquiring the lock on otherBuffer below, it's unlikely, but possible,
+	 * that another backend used space on this page. We check for that below,
+	 * and retry if necessary.
 	 */
-	if (otherBuffer != InvalidBuffer)
+	recheckVmPins = false;
+	if (unlockedTargetBuffer)
 	{
+		/* released lock on target buffer above */
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		recheckVmPins = true;
+	}
+	else if (otherBuffer != InvalidBuffer)
+	{
+		/*
+		 * We did not release the target buffer, and otherBuffer is valid,
+		 * need to lock the other buffer. It's guaranteed to be of a lower
+		 * page number than the new page. To conform with the deadlock
+		 * prevent rules, we ought to lock otherBuffer first, but that would
+		 * give other backends a chance to put tuples on our page. To reduce
+		 * the likelihood of that, attempt to lock the other buffer
+		 * conditionally, that's very likely to work.
+		 *
+		 * Alternatively, we could acquire the lock on otherBuffer before
+		 * extending the relation, but that'd require holding the lock while
+		 * performing IO, which seems worse than an unlikely retry.
+		 */
 		Assert(otherBuffer != buffer);
-		targetBlock = BufferGetBlockNumber(buffer);
 		Assert(targetBlock > otherBlock);
 
 		if (unlikely(!ConditionalLockBuffer(otherBuffer)))
 		{
+			unlockedTargetBuffer = true;
 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		}
+		recheckVmPins = true;
+	}
 
-		/*
-		 * Because the buffers were unlocked for a while, it's possible,
-		 * although unlikely, that an all-visible flag became set or that
-		 * somebody used up the available space in the new page. We can use
-		 * GetVisibilityMapPins to deal with the first case. In the second
-		 * case, just retry from start.
-		 */
-		GetVisibilityMapPins(relation, otherBuffer, buffer,
-							 otherBlock, targetBlock, vmbuffer_other,
-							 vmbuffer);
+	/*
+	 * If one of the buffers was unlocked (always the case if otherBuffer is
+	 * valid), it's possible, although unlikely, that an all-visible flag
+	 * became set. We can use GetVisibilityMapPins to deal with that. It's
+	 * possible that GetVisibilityMapPins() might need to temporarily release
+	 * buffer locks, in which case we'll need to check if there's still enough
+	 * space on the page below.
+	 */
+	if (recheckVmPins)
+	{
+		if (GetVisibilityMapPins(relation, otherBuffer, buffer,
+								 otherBlock, targetBlock, vmbuffer_other,
+								 vmbuffer))
+			unlockedTargetBuffer = true;
+	}
 
-		/*
-		 * Note that we have to check the available space even if our
-		 * conditional lock succeeded, because GetVisibilityMapPins might've
-		 * transiently released lock on the target buffer to acquire a VM pin
-		 * for the otherBuffer.
-		 */
-		if (len > PageGetHeapFreeSpace(page))
+	/*
+	 * If the target buffer was temporarily unlocked since the relation
+	 * extension, it's possible, although unlikely, that all the space on the
+	 * page was already used. If so, we just retry from the start. If we
+	 * didn't unlock, something has gone wrong if there's not enough space -
+	 * the test at the top should have prevented reaching this case.
+	 */
+	pageFreeSpace = PageGetHeapFreeSpace(page);
+	if (len > pageFreeSpace)
+	{
+		if (unlockedTargetBuffer)
 		{
-			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+			if (otherBuffer != InvalidBuffer)
+				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
 			UnlockReleaseBuffer(buffer);
 
 			goto loop;
 		}
-	}
-	else if (len > PageGetHeapFreeSpace(page))
-	{
-		/* We should not get here given the test at the top */
+
 		elog(PANIC, "tuple is too big: size %zu", len);
 	}
@@ -735,7 +779,7 @@ loop:
 	 * current backend to make more insertions or not, which is probably a
 	 * good bet most of the time. So for now, don't add it to FSM yet.
 	 */
-	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
+	RelationSetTargetBlock(relation, targetBlock);
 
 	return buffer;
 }