diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index fc9f964bf7..25d2a094dd 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -68,6 +68,7 @@ static void brinsummarize(Relation index, Relation heapRel, static void form_and_insert_tuple(BrinBuildState *state); static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b); +static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy); /* @@ -736,6 +737,8 @@ brinvacuumcleanup(PG_FUNCTION_ARGS) heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false), AccessShareLock); + brin_vacuum_scan(info->index, info->strategy); + brinsummarize(info->index, heapRel, &stats->num_index_tuples, &stats->num_index_tuples); @@ -1150,3 +1153,43 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) MemoryContextDelete(cxt); } + +/* + * brin_vacuum_scan + * Do a complete scan of the index during VACUUM. + * + * This routine scans the complete index looking for uncatalogued index pages, + * i.e. those that might have been lost due to a crash after index extension + * and such. + */ +static void +brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy) +{ + bool vacuum_fsm = false; + BlockNumber blkno; + + /* + * Scan the index in physical order, and clean up any possible mess in + * each page. + */ + for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++) + { + Buffer buf; + + CHECK_FOR_INTERRUPTS(); + + buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno, + RBM_NORMAL, strategy); + + vacuum_fsm |= brin_page_cleanup(idxrel, buf); + + ReleaseBuffer(buf); + } + + /* + * If we made any change to the FSM, make sure the new info is visible all + * the way to the top. + */ + if (vacuum_fsm) + FreeSpaceMapVacuum(idxrel); +} diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 0b257d913b..875544e336 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -24,8 +24,9 @@ static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, - bool *was_extended); + bool *extended); static Size br_page_get_freespace(Page page); +static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer); /* @@ -53,7 +54,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinTuple *oldtup; Size oldsz; Buffer newbuf; - bool extended = false; + bool extended; Assert(newsz == MAXALIGN(newsz)); @@ -64,11 +65,11 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); - /* XXX delay vacuuming FSM until locks are released? */ - if (extended) - FreeSpaceMapVacuum(idxrel); if (!BufferIsValid(newbuf)) + { + Assert(!extended); return false; + } /* * Note: it's possible (though unlikely) that the returned newbuf is @@ -76,7 +77,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, * buffer does in fact have enough space. */ if (newbuf == oldbuf) + { + Assert(!extended); newbuf = InvalidBuffer; + } } else { @@ -93,8 +97,20 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + /* + * If this happens, and the new buffer was obtained by extending the + * relation, then we need to ensure we don't leave it uninitialized or + * forget about it. + */ if (BufferIsValid(newbuf)) + { + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); + if (extended) + FreeSpaceMapVacuum(idxrel); + } return false; } @@ -108,7 +124,13 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) + { + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); + if (extended) + FreeSpaceMapVacuum(idxrel); + } return false; } @@ -125,7 +147,12 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) + { + /* as above */ + if (extended) + brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); + } START_CRIT_SECTION(); PageIndexDeleteNoCompact(oldpage, &oldoff, 1); @@ -157,6 +184,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + if (extended) + FreeSpaceMapVacuum(idxrel); + return true; } else if (newbuf == InvalidBuffer) @@ -178,11 +209,21 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; + BlockNumber newblk = InvalidBlockNumber; + Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); + /* + * We need to initialize the page if it's newly obtained. Note we + * will WAL-log the initialization as part of the update, so we don't + * need to do that here. + */ + if (extended) + brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); + PageIndexDeleteNoCompact(oldpage, &oldoff, 1); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); @@ -191,6 +232,13 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); + /* needed to update FSM below */ + if (extended) + { + newblk = BufferGetBlockNumber(newbuf); + freespace = br_page_get_freespace(newpage); + } + ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); @@ -235,6 +283,14 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); + + if (extended) + { + Assert(BlockNumberIsValid(newblk)); + RecordPageWithFreeSpace(idxrel, newblk, freespace); + FreeSpaceMapVacuum(idxrel); + } + return true; } } @@ -271,7 +327,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, OffsetNumber off; Buffer revmapbuf; ItemPointerData tid; - bool extended = false; + bool extended; Assert(itemsz == MAXALIGN(itemsz)); @@ -302,7 +358,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, { *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); Assert(BufferIsValid(*buffer)); - Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz); + Assert(extended || br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz); } /* Now obtain lock on revmap buffer */ @@ -312,6 +368,8 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, blk = BufferGetBlockNumber(*buffer); START_CRIT_SECTION(); + if (extended) + brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR); off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) @@ -489,27 +547,105 @@ brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, UnlockReleaseBuffer(buf); } +/* + * Given a BRIN index page, initialize it if necessary, and record it into the + * FSM if necessary. Return value is true if the FSM itself needs "vacuuming". + * The main use for this is when, during vacuuming, an uninitialized page is + * found, which could be the result of relation extension followed by a crash + * before the page can be used. + */ +bool +brin_page_cleanup(Relation idxrel, Buffer buf) +{ + Page page = BufferGetPage(buf); + Size freespace; + + /* + * If a page was left uninitialized, initialize it now; also record it in + * FSM. + * + * Somebody else might be extending the relation concurrently. To avoid + * re-initializing the page before they can grab the buffer lock, we + * acquire the extension lock momentarily. Since they hold the extension + * lock from before getting the page and after its been initialized, we're + * sure to see their initialization. + */ + if (PageIsNew(page)) + { + LockRelationForExtension(idxrel, ShareLock); + UnlockRelationForExtension(idxrel, ShareLock); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (PageIsNew(page)) + { + brin_initialize_empty_new_buffer(idxrel, buf); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + return true; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + /* Nothing to be done for non-regular index pages */ + if (BRIN_IS_META_PAGE(BufferGetPage(buf)) || + BRIN_IS_REVMAP_PAGE(BufferGetPage(buf))) + return false; + + /* Measure free space and record it */ + freespace = br_page_get_freespace(page); + if (freespace > GetRecordedFreeSpace(idxrel, BufferGetBlockNumber(buf))) + { + RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf), freespace); + return true; + } + + return false; +} + /* * Return a pinned and exclusively locked buffer which can be used to insert an * index item of size itemsz. If oldbuf is a valid buffer, it is also locked * (in an order determined to avoid deadlocks.) * - * If there's no existing page with enough free space to accommodate the new - * item, the relation is extended. If this happens, *extended is set to true. - * * If we find that the old page is no longer a regular index page (because * of a revmap extension), the old buffer is unlocked and we return * InvalidBuffer. + * + * If there's no existing page with enough free space to accommodate the new + * item, the relation is extended. If this happens, *extended is set to true, + * and it is the caller's responsibility to initialize the page (and WAL-log + * that fact) prior to use. + * + * Note that in some corner cases it is possible for this routine to extend the + * relation and then not return the buffer. It is this routine's + * responsibility to WAL-log the page initialization and to record the page in + * FSM if that happens. Such a buffer may later be reused by this routine. */ static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, - bool *was_extended) + bool *extended) { BlockNumber oldblk; BlockNumber newblk; Page page; int freespace; + /* + * If the item is oversized, don't bother. We have another, more precise + * check below. + */ + if (itemsz > BLCKSZ - sizeof(BrinSpecialSpace)) + { + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", + (unsigned long) itemsz, + (unsigned long) BLCKSZ - sizeof(BrinSpecialSpace), + RelationGetRelationName(irel)))); + return InvalidBuffer; /* keep compiler quiet */ + } + + *extended = false; + if (BufferIsValid(oldbuf)) oldblk = BufferGetBlockNumber(oldbuf); else @@ -528,7 +664,6 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, { Buffer buf; bool extensionLockHeld = false; - bool extended = false; CHECK_FOR_INTERRUPTS(); @@ -546,7 +681,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, } buf = ReadBuffer(irel, P_NEW); newblk = BufferGetBlockNumber(buf); - *was_extended = extended = true; + *extended = true; BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u", BufferGetBlockNumber(buf))); @@ -576,6 +711,25 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); + + /* + * It is possible that the new page was obtained from + * extending the relation. In that case, we must be sure to + * record it in the FSM before leaving, because otherwise the + * space would be lost forever. However, we cannot let an + * uninitialized page get in the FSM, so we need to initialize + * it first. + */ + if (*extended) + { + brin_initialize_empty_new_buffer(irel, buf); + /* shouldn't matter, but don't confuse caller */ + *extended = false; + } + + if (extensionLockHeld) + UnlockRelationForExtension(irel, ExclusiveLock); + ReleaseBuffer(buf); return InvalidBuffer; } @@ -588,9 +742,6 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, page = BufferGetPage(buf); - if (extended) - brin_page_init(page, BRIN_PAGETYPE_REGULAR); - /* * We have a new buffer to insert into. Check that the new page has * enough free space, and return it if it does; otherwise start over. @@ -600,7 +751,8 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, * (br_page_get_freespace also checks that the FSM didn't hand us a * page that has since been repurposed for the revmap.) */ - freespace = br_page_get_freespace(page); + freespace = *extended ? + BLCKSZ - sizeof(BrinSpecialSpace) : br_page_get_freespace(page); if (freespace >= itemsz) { RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); @@ -610,7 +762,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, * invalidations, make sure we update the more permanent FSM with * data about it before going away. */ - if (extended) + if (*extended) RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), freespace); @@ -634,12 +786,13 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, /* * If an entirely new page does not contain enough free space for the * new item, then surely that item is oversized. Complain loudly; but - * first make sure we record the page as free, for next time. + * first make sure we initialize the page and record it as free, for + * next time. */ - if (extended) + if (*extended) { - RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), - freespace); + brin_initialize_empty_new_buffer(irel, buf); + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", @@ -658,6 +811,43 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, } } +/* + * Initialize a page as an empty regular BRIN page, WAL-log this, and record + * the page in FSM. + * + * There are several corner situations in which we extend the relation to + * obtain a new page and later find that we cannot use it immediately. When + * that happens, we don't want to leave the page go unrecorded in FSM, because + * there is no mechanism to get the space back and the index would bloat. + * Also, because we would not WAL-log the action that would initialize the + * page, the page would go uninitialized in a standby (or after recovery). + */ +static void +brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) +{ + Page page; + + BRIN_elog((DEBUG2, + "brin_initialize_empty_new_buffer: initializing blank page %u", + BufferGetBlockNumber(buffer))); + + START_CRIT_SECTION(); + page = BufferGetPage(buffer); + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + MarkBufferDirty(buffer); + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); + + /* + * We update the FSM for this page, but this is not WAL-logged. This is + * acceptable because VACUUM will scan the index and update the FSM with + * pages whose FSM records were forgotten in a crash. + */ + RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer), + br_page_get_freespace(page)); +} + + /* * Return the amount of free space on a regular BRIN index page. * diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h index ecbd13a9a3..c827ca367d 100644 --- a/src/include/access/brin_page.h +++ b/src/include/access/brin_page.h @@ -52,6 +52,7 @@ typedef struct BrinSpecialSpace #define BRIN_PAGETYPE_REVMAP 0xF092 #define BRIN_PAGETYPE_REGULAR 0xF093 +#define BRIN_IS_META_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_META) #define BRIN_IS_REVMAP_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REVMAP) #define BRIN_IS_REGULAR_PAGE(page) (BrinPageType(page) == BRIN_PAGETYPE_REGULAR) diff --git a/src/include/access/brin_pageops.h b/src/include/access/brin_pageops.h index 6ebe8ef764..1007e07607 100644 --- a/src/include/access/brin_pageops.h +++ b/src/include/access/brin_pageops.h @@ -33,4 +33,6 @@ extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf); extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer buf); +extern bool brin_page_cleanup(Relation idxrel, Buffer buf); + #endif /* BRIN_PAGEOPS_H */