Close some holes in BRIN page assignment

In some corner cases, it is possible for the BRIN index relation to be
extended by brin_getinsertbuffer but for the new page not to be used
immediately for anything by its callers; when this happens, the page is
initialized and the FSM is updated (by brin_getinsertbuffer) with the
info about that page, but these actions are not WAL-logged.  A later
index insert/update can use the page, but since the page is already
initialized, the initialization itself is not WAL-logged then either.
Replay of this sequence of events causes recovery to fail altogether.
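
In pseudo-C, the failing sequence is roughly this (a condensed
illustration only; the named routines are the real BRIN functions, but
the calling code shown here is hypothetical and merely paraphrases the
description above):

    /* 1: brin_getinsertbuffer() extends the relation, calls
     *    brin_page_init() and RecordPageWithFreeSpace() -- none of
     *    which is WAL-logged */
    buf = brin_getinsertbuffer(irel, InvalidBuffer, itemsz, &extended);

    /* 2: the caller bails out without putting a tuple on the new page */

    /* 3: a later insert obtains the page via the FSM; PageIsNew() is
     *    false because of step 1, so no initialization is WAL-logged
     *    here either */
    off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
                      false, false);

    /* 4: recovery replays only the insert record from step 3 and lands
     *    on a page whose initialization never reached the WAL */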

There is a related corner case within brin_getinsertbuffer itself, in
which we extend the relation to put a new index tuple there, but later
find out that we cannot do so, and do not return the buffer; the page
obtained from extension is not even initialized.  The resulting page is
lost forever.

To fix, shuffle the code so that initialization is not the
responsibility of brin_getinsertbuffer anymore, in normal cases;
instead, the initialization is done by its callers (brin_doinsert and
brin_doupdate) once they're certain that the page is going to be used.
When either of those functions determines that the new page cannot be used,
before bailing out they initialize the page as an empty regular page,
enter it in FSM and WAL-log all this.  This way, the page is usable for
future index insertions, and WAL replay doesn't find itself trying to insert
tuples into pages whose initialization didn't make it to the WAL.  The
same strategy is used in brin_getinsertbuffer when it cannot return the
new page.
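
Condensed from the brin_doinsert hunk below, the happy-path pattern is
now the following (a sketch only; the "initialize page" flag in the
insert's WAL record is assumed from the surrounding BRIN WAL machinery,
which this excerpt does not show):

    *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);

    START_CRIT_SECTION();
    /* initialize only once we're certain the page will really be used */
    if (extended)
        brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
    off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
                      false, false);
    /* the insert's WAL record is flagged so that replay re-creates the
     * empty page before inserting into it */
    END_CRIT_SECTION();

On the bail-out paths, the new helper brin_initialize_empty_new_buffer
(added near the end of the diff) performs the page initialization,
WAL-logs it with log_newpage_buffer, and records the page in the FSM.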

Additionally, add a new step to vacuuming so that all pages of the index
are scanned; whenever an uninitialized page is found, it is initialized
as empty and WAL-logged.  This closes the hole where the relation is
extended but the system crashes before anything is WAL-logged about it.
We also take this opportunity to update the FSM, in case it has gotten
out of date.
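
The new pass (brin_vacuum_scan, shown in full in the first hunk below)
reduces to a physical-order scan of the index:

    for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
    {
        Buffer  buf;

        CHECK_FOR_INTERRUPTS();
        buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
                                 RBM_NORMAL, strategy);
        /* initializes the page if PageIsNew(), refreshes its FSM entry */
        vacuum_fsm |= brin_page_cleanup(idxrel, buf);
        ReleaseBuffer(buf);
    }
    if (vacuum_fsm)
        FreeSpaceMapVacuum(idxrel);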

Thanks to Heikki Linnakangas for finding the problem that kicked off this
additional analysis of the BRIN page assignment code.

Backpatch to 9.5, where BRIN was introduced.

Discussion: https://www.postgresql.org/message-id/20150723204810.GY5596@postgresql.org
Alvaro Herrera 2015-08-12 14:20:38 -03:00
parent a4b059fdde
commit ccc4c07499
4 changed files with 258 additions and 22 deletions

@@ -68,6 +68,7 @@ static void brinsummarize(Relation index, Relation heapRel,
static void form_and_insert_tuple(BrinBuildState *state);
static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
BrinTuple *b);
static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
/*
@@ -736,6 +737,8 @@ brinvacuumcleanup(PG_FUNCTION_ARGS)
heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
AccessShareLock);
brin_vacuum_scan(info->index, info->strategy);
brinsummarize(info->index, heapRel,
&stats->num_index_tuples, &stats->num_index_tuples);
@@ -1150,3 +1153,43 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
MemoryContextDelete(cxt);
}
/*
* brin_vacuum_scan
* Do a complete scan of the index during VACUUM.
*
* This routine scans the complete index looking for uncatalogued index pages,
* i.e. those that might have been lost due to a crash after index extension
* and such.
*/
static void
brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
{
bool vacuum_fsm = false;
BlockNumber blkno;
/*
* Scan the index in physical order, and clean up any possible mess in
* each page.
*/
for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
{
Buffer buf;
CHECK_FOR_INTERRUPTS();
buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
RBM_NORMAL, strategy);
vacuum_fsm |= brin_page_cleanup(idxrel, buf);
ReleaseBuffer(buf);
}
/*
* If we made any change to the FSM, make sure the new info is visible all
* the way to the top.
*/
if (vacuum_fsm)
FreeSpaceMapVacuum(idxrel);
}

@@ -24,8 +24,9 @@
static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
- bool *was_extended);
+ bool *extended);
static Size br_page_get_freespace(Page page);
static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
/*
@@ -53,7 +54,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
BrinTuple *oldtup;
Size oldsz;
Buffer newbuf;
- bool extended = false;
+ bool extended;
Assert(newsz == MAXALIGN(newsz));
@@ -64,11 +65,11 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
{
/* need a page on which to put the item */
newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
- /* XXX delay vacuuming FSM until locks are released? */
- if (extended)
- FreeSpaceMapVacuum(idxrel);
if (!BufferIsValid(newbuf))
{
Assert(!extended);
return false;
}
/*
* Note: it's possible (though unlikely) that the returned newbuf is
@@ -76,7 +77,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
* buffer does in fact have enough space.
*/
if (newbuf == oldbuf)
{
Assert(!extended);
newbuf = InvalidBuffer;
}
}
else
{
@@ -93,8 +97,20 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
if (!ItemIdIsNormal(oldlp))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
/*
* If this happens, and the new buffer was obtained by extending the
* relation, then we need to ensure we don't leave it uninitialized or
* forget about it.
*/
if (BufferIsValid(newbuf))
{
if (extended)
brin_initialize_empty_new_buffer(idxrel, newbuf);
UnlockReleaseBuffer(newbuf);
if (extended)
FreeSpaceMapVacuum(idxrel);
}
return false;
}
@@ -108,7 +124,13 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
if (BufferIsValid(newbuf))
{
if (extended)
brin_initialize_empty_new_buffer(idxrel, newbuf);
UnlockReleaseBuffer(newbuf);
if (extended)
FreeSpaceMapVacuum(idxrel);
}
return false;
}
@@ -125,7 +147,12 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
brin_can_do_samepage_update(oldbuf, origsz, newsz))
{
if (BufferIsValid(newbuf))
{
/* as above */
if (extended)
brin_initialize_empty_new_buffer(idxrel, newbuf);
UnlockReleaseBuffer(newbuf);
}
START_CRIT_SECTION();
PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
@@ -157,6 +184,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
END_CRIT_SECTION();
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
if (extended)
FreeSpaceMapVacuum(idxrel);
return true;
}
else if (newbuf == InvalidBuffer)
@@ -178,11 +209,21 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
Buffer revmapbuf;
ItemPointerData newtid;
OffsetNumber newoff;
BlockNumber newblk = InvalidBlockNumber;
Size freespace = 0;
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
START_CRIT_SECTION();
/*
* We need to initialize the page if it's newly obtained. Note we
* will WAL-log the initialization as part of the update, so we don't
* need to WAL-log it separately here.
*/
if (extended)
brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR);
PageIndexDeleteNoCompact(oldpage, &oldoff, 1);
newoff = PageAddItem(newpage, (Item) newtup, newsz,
InvalidOffsetNumber, false, false);
@@ -191,6 +232,13 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
MarkBufferDirty(oldbuf);
MarkBufferDirty(newbuf);
/* needed to update FSM below */
if (extended)
{
newblk = BufferGetBlockNumber(newbuf);
freespace = br_page_get_freespace(newpage);
}
ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff);
brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
MarkBufferDirty(revmapbuf);
@@ -235,6 +283,14 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
UnlockReleaseBuffer(newbuf);
if (extended)
{
Assert(BlockNumberIsValid(newblk));
RecordPageWithFreeSpace(idxrel, newblk, freespace);
FreeSpaceMapVacuum(idxrel);
}
return true;
}
}
@@ -271,7 +327,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
OffsetNumber off;
Buffer revmapbuf;
ItemPointerData tid;
- bool extended = false;
+ bool extended;
Assert(itemsz == MAXALIGN(itemsz));
@@ -302,7 +358,7 @@
{
*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
Assert(BufferIsValid(*buffer));
- Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
+ Assert(extended || br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz);
}
/* Now obtain lock on revmap buffer */
@@ -312,6 +368,8 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
blk = BufferGetBlockNumber(*buffer);
START_CRIT_SECTION();
if (extended)
brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR);
off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
false, false);
if (off == InvalidOffsetNumber)
@@ -489,27 +547,105 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
UnlockReleaseBuffer(buf);
}
/*
* Given a BRIN index page, initialize it if necessary, and record it into the
* FSM if necessary. Return value is true if the FSM itself needs "vacuuming".
* The main use for this is when, during vacuuming, an uninitialized page is
* found, which could be the result of relation extension followed by a crash
* before the page can be used.
*/
bool
brin_page_cleanup(Relation idxrel, Buffer buf)
{
Page page = BufferGetPage(buf);
Size freespace;
/*
* If a page was left uninitialized, initialize it now; also record it in
* FSM.
*
* Somebody else might be extending the relation concurrently. To avoid
* re-initializing the page before they can grab the buffer lock, we
* acquire the extension lock momentarily. Since they hold the extension
* lock from before getting the page until after it's been initialized, we're
* sure to see their initialization.
*/
if (PageIsNew(page))
{
LockRelationForExtension(idxrel, ShareLock);
UnlockRelationForExtension(idxrel, ShareLock);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (PageIsNew(page))
{
brin_initialize_empty_new_buffer(idxrel, buf);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
return true;
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/* Nothing to be done for non-regular index pages */
if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
return false;
/* Measure free space and record it */
freespace = br_page_get_freespace(page);
if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf)))
{
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace);
return true;
}
return false;
}
/*
* Return a pinned and exclusively locked buffer which can be used to insert an
* index item of size itemsz. If oldbuf is a valid buffer, it is also locked
* (in an order determined to avoid deadlocks.)
*
- * If there's no existing page with enough free space to accommodate the new
- * item, the relation is extended. If this happens, *extended is set to true.
*
* If we find that the old page is no longer a regular index page (because
* of a revmap extension), the old buffer is unlocked and we return
* InvalidBuffer.
*
* If there's no existing page with enough free space to accommodate the new
* item, the relation is extended. If this happens, *extended is set to true,
* and it is the caller's responsibility to initialize the page (and WAL-log
* that fact) prior to use.
*
* Note that in some corner cases it is possible for this routine to extend the
* relation and then not return the buffer. It is this routine's
* responsibility to WAL-log the page initialization and to record the page in
* FSM if that happens. Such a buffer may later be reused by this routine.
*/
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
- bool *was_extended)
+ bool *extended)
{
BlockNumber oldblk;
BlockNumber newblk;
Page page;
int freespace;
/*
* If the item is oversized, don't bother. We have another, more precise
* check below.
*/
if (itemsz > BLCKSZ - sizeof(BrinSpecialSpace))
{
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
(unsigned long) itemsz,
(unsigned long) BLCKSZ - sizeof(BrinSpecialSpace),
RelationGetRelationName(irel))));
return InvalidBuffer; /* keep compiler quiet */
}
*extended = false;
if (BufferIsValid(oldbuf))
oldblk = BufferGetBlockNumber(oldbuf);
else
@@ -528,7 +664,6 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
{
Buffer buf;
bool extensionLockHeld = false;
- bool extended = false;
CHECK_FOR_INTERRUPTS();
@@ -546,7 +681,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
}
buf = ReadBuffer(irel, P_NEW);
newblk = BufferGetBlockNumber(buf);
- *was_extended = extended = true;
+ *extended = true;
BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
BufferGetBlockNumber(buf)));
@@ -576,6 +711,25 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
/*
* It is possible that the new page was obtained from
* extending the relation. In that case, we must be sure to
* record it in the FSM before leaving, because otherwise the
* space would be lost forever. However, we cannot let an
* uninitialized page get in the FSM, so we need to initialize
* it first.
*/
if (*extended)
{
brin_initialize_empty_new_buffer(irel, buf);
/* shouldn't matter, but don't confuse caller */
*extended = false;
}
if (extensionLockHeld)
UnlockRelationForExtension(irel, ExclusiveLock);
ReleaseBuffer(buf);
return InvalidBuffer;
}
@@ -588,9 +742,6 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
page = BufferGetPage(buf);
- if (extended)
- brin_page_init(page, BRIN_PAGETYPE_REGULAR);
/*
* We have a new buffer to insert into. Check that the new page has
* enough free space, and return it if it does; otherwise start over.
@@ -600,7 +751,8 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
* (br_page_get_freespace also checks that the FSM didn't hand us a
* page that has since been repurposed for the revmap.)
*/
- freespace = br_page_get_freespace(page);
+ freespace = *extended ?
+ BLCKSZ - sizeof(BrinSpecialSpace) : br_page_get_freespace(page);
if (freespace >= itemsz)
{
RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
@@ -610,7 +762,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
* invalidations, make sure we update the more permanent FSM with
* data about it before going away.
*/
- if (extended)
+ if (*extended)
RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
freespace);
@@ -634,12 +786,13 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
/*
* If an entirely new page does not contain enough free space for the
* new item, then surely that item is oversized. Complain loudly; but
- * first make sure we record the page as free, for next time.
+ * first make sure we initialize the page and record it as free, for
+ * next time.
*/
- if (extended)
+ if (*extended)
{
- RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf),
- freespace);
+ brin_initialize_empty_new_buffer(irel, buf);
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
@@ -658,6 +811,43 @@
}
}
/*
* Initialize a page as an empty regular BRIN page, WAL-log this, and record
* the page in FSM.
*
* There are several corner situations in which we extend the relation to
* obtain a new page and later find that we cannot use it immediately. When
* that happens, we don't want to let the page go unrecorded in FSM, because
* there is no mechanism to get the space back and the index would bloat.
* Also, because we would not WAL-log the action that would initialize the
* page, the page would go uninitialized in a standby (or after recovery).
*/
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
Page page;
BRIN_elog((DEBUG2,
"brin_initialize_empty_new_buffer: initializing blank page %u",
BufferGetBlockNumber(buffer)));
START_CRIT_SECTION();
page = BufferGetPage(buffer);
brin_page_init(page, BRIN_PAGETYPE_REGULAR);
MarkBufferDirty(buffer);
log_newpage_buffer(buffer, true);
END_CRIT_SECTION();
/*
* We update the FSM for this page, but this is not WAL-logged. This is
* acceptable because VACUUM will scan the index and update the FSM with
* pages whose FSM records were forgotten in a crash.
*/
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
br_page_get_freespace(page));
}
/*
* Return the amount of free space on a regular BRIN index page.
*

@@ -52,6 +52,7 @@ typedef struct BrinSpecialSpace
#define BRIN_PAGETYPE_REVMAP 0xF092
#define BRIN_PAGETYPE_REGULAR 0xF093
#define BRIN_IS_META_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_META)
#define BRIN_IS_REVMAP_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REVMAP)
#define BRIN_IS_REGULAR_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REGULAR)

@@ -33,4 +33,6 @@ extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer buf);
extern bool brin_page_cleanup(Relation idxrel, Buffer buf);
#endif /* BRIN_PAGEOPS_H */