diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 6979aff727..94a9759322 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -183,7 +183,6 @@ static inline bool invariant_l_nontarget_offset(BtreeCheckState *state, OffsetNumber upperbound); static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum); static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, - Relation heaprel, IndexTuple itup); static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset); @@ -332,7 +331,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, RelationGetRelationName(indrel)))); /* Extract metadata from metapage, and sanitize it in passing */ - _bt_metaversion(indrel, heaprel, &heapkeyspace, &allequalimage); + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); if (allequalimage && !heapkeyspace) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1259,7 +1258,7 @@ bt_target_page_check(BtreeCheckState *state) } /* Build insertion scankey for current page offset */ - skey = bt_mkscankey_pivotsearch(state->rel, state->heaprel, itup); + skey = bt_mkscankey_pivotsearch(state->rel, itup); /* * Make sure tuple size does not exceed the relevant BTREE_VERSION @@ -1769,7 +1768,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) * memory remaining allocated. */ firstitup = (IndexTuple) PageGetItem(rightpage, rightitem); - return bt_mkscankey_pivotsearch(state->rel, state->heaprel, firstitup); + return bt_mkscankey_pivotsearch(state->rel, firstitup); } /* @@ -2682,7 +2681,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) Buffer lbuf; bool exists; - key = _bt_mkscankey(state->rel, state->heaprel, itup); + key = _bt_mkscankey(state->rel, itup); Assert(key->heapkeyspace && key->scantid != NULL); /* @@ -2695,7 +2694,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) */ Assert(state->readonly && state->rootdescend); exists = false; - stack = _bt_search(state->rel, state->heaprel, key, &lbuf, BT_READ, NULL); + stack = _bt_search(state->rel, NULL, key, &lbuf, BT_READ, NULL); if (BufferIsValid(lbuf)) { @@ -3134,11 +3133,11 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * the scankey is greater. */ static inline BTScanInsert -bt_mkscankey_pivotsearch(Relation rel, Relation heaprel, IndexTuple itup) +bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) { BTScanInsert skey; - skey = _bt_mkscankey(rel, heaprel, itup); + skey = _bt_mkscankey(rel, itup); skey->pivotsearch = true; return skey; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 646135cc21..0755be8390 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -731,14 +731,9 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, *multi_cutoff); - /* - * Set up sorting if wanted. NewHeap is being passed to - * tuplesort_begin_cluster(), it could have been OldHeap too. It does not - * really matter, as the goal is to have a heap relation being passed to - * _bt_log_reuse_page() (which should not be called from this code path). 
- */ + /* Set up sorting if wanted */ if (use_sort) - tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, NewHeap, + tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, maintenance_work_mem, NULL, TUPLESORT_NONE); else diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 84bbd78d59..d33f814a93 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -59,7 +59,7 @@ static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, IndexTuple nposting, uint16 postingoff); static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly); -static Buffer _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf); +static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf); static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem); static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, @@ -110,7 +110,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); /* we need an insertion scan key to do our search, so build one */ - itup_key = _bt_mkscankey(rel, heapRel, itup); + itup_key = _bt_mkscankey(rel, itup); if (checkingunique) { @@ -1024,13 +1024,15 @@ _bt_findinsertloc(Relation rel, * indexes. */ static void -_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack) +_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, + BTStack stack) { Page page; BTPageOpaque opaque; Buffer rbuf; BlockNumber rblkno; + Assert(heaprel != NULL); page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); @@ -1145,7 +1147,7 @@ _bt_insertonpg(Relation rel, /* * Every internal page should have exactly one negative infinity item at - * all times. Only _bt_split() and _bt_newroot() should add items that + * all times. Only _bt_split() and _bt_newlevel() should add items that * become negative infinity items through truncation, since they're the * only routines that allocate new internal pages. */ @@ -1250,14 +1252,14 @@ _bt_insertonpg(Relation rel, * only one on its tree level, but was not the root, it may have been * the "fast root". We need to ensure that the fast root link points * at or above the current page. We can safely acquire a lock on the - * metapage here --- see comments for _bt_newroot(). + * metapage here --- see comments for _bt_newlevel(). */ if (unlikely(split_only_page)) { Assert(!isleaf); Assert(BufferIsValid(cbuf)); - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); @@ -1421,7 +1423,7 @@ _bt_insertonpg(Relation rel, * call _bt_getrootheight while holding a buffer lock. */ if (BlockNumberIsValid(blockcache) && - _bt_getrootheight(rel, heaprel) >= BTREE_FASTPATH_MIN_LEVEL) + _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, blockcache); } @@ -1715,7 +1717,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, * way because it avoids an unnecessary PANIC when either origpage or its * existing sibling page are corrupt. 
*/ - rbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE); + rbuf = _bt_allocbuf(rel, heaprel); rightpage = BufferGetPage(rbuf); rightpagenumber = BufferGetBlockNumber(rbuf); /* rightpage was initialized by _bt_getbuf */ @@ -1888,7 +1890,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, */ if (!isrightmost) { - sbuf = _bt_getbuf(rel, heaprel, oopaque->btpo_next, BT_WRITE); + sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); spage = BufferGetPage(sbuf); sopaque = BTPageGetOpaque(spage); if (sopaque->btpo_prev != origpagenumber) @@ -2102,6 +2104,8 @@ _bt_insert_parent(Relation rel, bool isroot, bool isonly) { + Assert(heaprel != NULL); + /* * Here we have to do something Lehman and Yao don't talk about: deal with * a root split and construction of a new root. If our stack is empty @@ -2121,8 +2125,8 @@ _bt_insert_parent(Relation rel, Assert(stack == NULL); Assert(isonly); - /* create a new root node and update the metapage */ - rootbuf = _bt_newroot(rel, heaprel, buf, rbuf); + /* create a new root node one level up and update the metapage */ + rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf); /* release the split buffers */ _bt_relbuf(rel, rootbuf); _bt_relbuf(rel, rbuf); @@ -2161,8 +2165,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, heaprel, opaque->btpo_level + 1, false, - NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); @@ -2230,6 +2233,9 @@ _bt_insert_parent(Relation rel, * * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked * and unpinned. + * + * Caller must provide a valid heaprel, since finishing a page split requires + * allocating a new page if and when the parent page splits in turn. */ void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) @@ -2243,9 +2249,10 @@ _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) bool wasonly; Assert(P_INCOMPLETE_SPLIT(lpageop)); + Assert(heaprel != NULL); /* Lock right sibling, the one missing the downlink */ - rbuf = _bt_getbuf(rel, heaprel, lpageop->btpo_next, BT_WRITE); + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); rpage = BufferGetPage(rbuf); rpageop = BTPageGetOpaque(rpage); @@ -2257,7 +2264,7 @@ _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) BTMetaPageData *metad; /* acquire lock on the metapage */ - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); @@ -2323,10 +2330,11 @@ _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child Page page; BTPageOpaque opaque; - buf = _bt_getbuf(rel, heaprel, blkno, BT_WRITE); + buf = _bt_getbuf(rel, blkno, BT_WRITE); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); + Assert(heaprel != NULL); if (P_INCOMPLETE_SPLIT(opaque)) { _bt_finish_split(rel, heaprel, buf, stack->bts_parent); @@ -2415,7 +2423,7 @@ _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child } /* - * _bt_newroot() -- Create a new root page for the index. + * _bt_newlevel() -- Create a new level above root page. * * We've just split the old root page and need to create a new one. 
* In order to do this, we add a new root page to the file, then lock @@ -2433,7 +2441,7 @@ _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child * lbuf, rbuf & rootbuf. */ static Buffer -_bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) +_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) { Buffer rootbuf; Page lpage, @@ -2459,12 +2467,12 @@ _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) lopaque = BTPageGetOpaque(lpage); /* get a new root page */ - rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE); + rootbuf = _bt_allocbuf(rel, heaprel); rootpage = BufferGetPage(rootbuf); rootblknum = BufferGetBlockNumber(rootbuf); /* acquire lock on the metapage */ - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 486d1563d8..c2050656e4 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -38,10 +38,9 @@ #include "utils/snapmgr.h" static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); -static void _bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno, - FullTransactionId safexid); -static void _bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf, +static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, + bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable); static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, @@ -177,7 +176,7 @@ _bt_getmeta(Relation rel, Buffer metabuf) * index tuples needed to be deleted. */ bool -_bt_vacuum_needs_cleanup(Relation rel, Relation heaprel) +_bt_vacuum_needs_cleanup(Relation rel) { Buffer metabuf; Page metapg; @@ -190,7 +189,7 @@ _bt_vacuum_needs_cleanup(Relation rel, Relation heaprel) * * Note that we deliberately avoid using cached version of metapage here. */ - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); btm_version = metad->btm_version; @@ -230,7 +229,7 @@ _bt_vacuum_needs_cleanup(Relation rel, Relation heaprel) * finalized. */ void -_bt_set_cleanup_info(Relation rel, Relation heaprel, BlockNumber num_delpages) +_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) { Buffer metabuf; Page metapg; @@ -254,7 +253,7 @@ _bt_set_cleanup_info(Relation rel, Relation heaprel, BlockNumber num_delpages) * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just * to be consistent. */ - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); @@ -326,6 +325,9 @@ _bt_set_cleanup_info(Relation rel, Relation heaprel, BlockNumber num_delpages) * NOTE that the returned root page will have only a read lock set * on it even if access = BT_WRITE! * + * If access = BT_WRITE, heaprel must be set; otherwise caller can just + * pass NULL. See _bt_allocbuf for an explanation. + * * The returned page is not necessarily the true root --- it could be * a "fast root" (a page that is alone in its level due to deletions). 
* Also, if the root page is split while we are "in flight" to it, @@ -349,6 +351,8 @@ _bt_getroot(Relation rel, Relation heaprel, int access) uint32 rootlevel; BTMetaPageData *metad; + Assert(access == BT_READ || heaprel != NULL); + /* * Try to use previously-cached metapage data to find the root. This * normally saves one buffer access per index search, which is a very @@ -369,7 +373,7 @@ _bt_getroot(Relation rel, Relation heaprel, int access) Assert(rootblkno != P_NONE); rootlevel = metad->btm_fastlevel; - rootbuf = _bt_getbuf(rel, heaprel, rootblkno, BT_READ); + rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = BTPageGetOpaque(rootpage); @@ -395,7 +399,7 @@ _bt_getroot(Relation rel, Relation heaprel, int access) rel->rd_amcache = NULL; } - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metad = _bt_getmeta(rel, metabuf); /* if no root page initialized yet, do it */ @@ -436,7 +440,7 @@ _bt_getroot(Relation rel, Relation heaprel, int access) * the new root page. Since this is the first page in the tree, it's * a leaf as well as the root. */ - rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE); + rootbuf = _bt_allocbuf(rel, heaprel); rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); rootopaque = BTPageGetOpaque(rootpage); @@ -573,7 +577,7 @@ _bt_getroot(Relation rel, Relation heaprel, int access) * moving to the root --- that'd deadlock against any concurrent root split.) */ Buffer -_bt_gettrueroot(Relation rel, Relation heaprel) +_bt_gettrueroot(Relation rel) { Buffer metabuf; Page metapg; @@ -595,7 +599,7 @@ _bt_gettrueroot(Relation rel, Relation heaprel) pfree(rel->rd_amcache); rel->rd_amcache = NULL; - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metaopaque = BTPageGetOpaque(metapg); metad = BTPageGetMeta(metapg); @@ -668,7 +672,7 @@ _bt_gettrueroot(Relation rel, Relation heaprel) * about updating previously cached data. */ int -_bt_getrootheight(Relation rel, Relation heaprel) +_bt_getrootheight(Relation rel) { BTMetaPageData *metad; @@ -676,7 +680,7 @@ _bt_getrootheight(Relation rel, Relation heaprel) { Buffer metabuf; - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metad = _bt_getmeta(rel, metabuf); /* @@ -732,7 +736,7 @@ _bt_getrootheight(Relation rel, Relation heaprel) * pg_upgrade'd from Postgres 12. */ void -_bt_metaversion(Relation rel, Relation heaprel, bool *heapkeyspace, bool *allequalimage) +_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage) { BTMetaPageData *metad; @@ -740,7 +744,7 @@ _bt_metaversion(Relation rel, Relation heaprel, bool *heapkeyspace, bool *allequ { Buffer metabuf; - metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metad = _bt_getmeta(rel, metabuf); /* @@ -821,37 +825,7 @@ _bt_checkpage(Relation rel, Buffer buf) } /* - * Log the reuse of a page from the FSM. - */ -static void -_bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno, - FullTransactionId safexid) -{ - xl_btree_reuse_page xlrec_reuse; - - /* - * Note that we don't register the buffer with the record, because this - * operation doesn't modify the page. This record only exists to provide a - * conflict point for Hot Standby. 
- */ - - /* XLOG stuff */ - xlrec_reuse.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel); - xlrec_reuse.locator = rel->rd_locator; - xlrec_reuse.block = blkno; - xlrec_reuse.snapshotConflictHorizon = safexid; - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); - - XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); -} - -/* - * _bt_getbuf() -- Get a buffer by block number for read or write. - * - * blkno == P_NEW means to get an unallocated index page. The page - * will be initialized before returning it. + * _bt_getbuf() -- Get an existing block in a buffer, for read or write. * * The general rule in nbtree is that it's never okay to access a * page without holding both a buffer pin and a buffer lock on @@ -860,136 +834,165 @@ _bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno, * When this routine returns, the appropriate lock is set on the * requested buffer and its reference count has been incremented * (ie, the buffer is "locked and pinned"). Also, we apply - * _bt_checkpage to sanity-check the page (except in P_NEW case), - * and perform Valgrind client requests that help Valgrind detect - * unsafe page accesses. + * _bt_checkpage to sanity-check the page, and perform Valgrind + * client requests that help Valgrind detect unsafe page accesses. * * Note: raw LockBuffer() calls are disallowed in nbtree; all * buffer lock requests need to go through wrapper functions such * as _bt_lockbuf(). */ Buffer -_bt_getbuf(Relation rel, Relation heaprel, BlockNumber blkno, int access) +_bt_getbuf(Relation rel, BlockNumber blkno, int access) { Buffer buf; - if (blkno != P_NEW) + Assert(BlockNumberIsValid(blkno)); + + /* Read an existing block of the relation */ + buf = ReadBuffer(rel, blkno); + _bt_lockbuf(rel, buf, access); + _bt_checkpage(rel, buf); + + return buf; +} + +/* + * _bt_allocbuf() -- Allocate a new block/page. + * + * Returns a write-locked buffer containing an unallocated nbtree page. + * + * Callers are required to pass a valid heaprel. We need heaprel so that we + * can handle generating a snapshotConflictHorizon that makes reusing a page + * from the FSM safe for queries that may be running on standbys. + */ +Buffer +_bt_allocbuf(Relation rel, Relation heaprel) +{ + Buffer buf; + BlockNumber blkno; + Page page; + + Assert(heaprel != NULL); + + /* + * First see if the FSM knows of any free pages. + * + * We can't trust the FSM's report unreservedly; we have to check that the + * page is still free. (For example, an already-free page could have been + * re-used between the time the last VACUUM scanned it and the time the + * VACUUM made its FSM updates.) + * + * In fact, it's worse than that: we can't even assume that it's safe to + * take a lock on the reported page. If somebody else has a lock on it, + * or even worse our own caller does, we could deadlock. (The own-caller + * scenario is actually not improbable. Consider an index on a serial or + * timestamp column. Nearly all splits will be at the rightmost page, so + * it's entirely likely that _bt_split will call us while holding a lock + * on the page most recently acquired from FSM. A VACUUM running + * concurrently with the previous split could well have placed that page + * back in FSM.) + * + * To get around that, we ask for only a conditional lock on the reported + * page. If we fail, then someone else is using the page, and we may + * reasonably assume it's not free. 
(If we happen to be wrong, the worst + * consequence is the page will be lost to use till the next VACUUM, which + * is no big problem.) + */ + for (;;) { - /* Read an existing block of the relation */ + blkno = GetFreeIndexPage(rel); + if (blkno == InvalidBlockNumber) + break; buf = ReadBuffer(rel, blkno); - _bt_lockbuf(rel, buf, access); - _bt_checkpage(rel, buf); - } - else - { - Page page; - - Assert(access == BT_WRITE); - - /* - * First see if the FSM knows of any free pages. - * - * We can't trust the FSM's report unreservedly; we have to check that - * the page is still free. (For example, an already-free page could - * have been re-used between the time the last VACUUM scanned it and - * the time the VACUUM made its FSM updates.) - * - * In fact, it's worse than that: we can't even assume that it's safe - * to take a lock on the reported page. If somebody else has a lock - * on it, or even worse our own caller does, we could deadlock. (The - * own-caller scenario is actually not improbable. Consider an index - * on a serial or timestamp column. Nearly all splits will be at the - * rightmost page, so it's entirely likely that _bt_split will call us - * while holding a lock on the page most recently acquired from FSM. A - * VACUUM running concurrently with the previous split could well have - * placed that page back in FSM.) - * - * To get around that, we ask for only a conditional lock on the - * reported page. If we fail, then someone else is using the page, - * and we may reasonably assume it's not free. (If we happen to be - * wrong, the worst consequence is the page will be lost to use till - * the next VACUUM, which is no big problem.) - */ - for (;;) + if (_bt_conditionallockbuf(rel, buf)) { - blkno = GetFreeIndexPage(rel); - if (blkno == InvalidBlockNumber) - break; - buf = ReadBuffer(rel, blkno); - if (_bt_conditionallockbuf(rel, buf)) - { - page = BufferGetPage(buf); + page = BufferGetPage(buf); + /* + * It's possible to find an all-zeroes page in an index. For + * example, a backend might successfully extend the relation one + * page and then crash before it is able to make a WAL entry for + * adding the page. If we find a zeroed page then reclaim it + * immediately. + */ + if (PageIsNew(page)) + { + /* Okay to use page. Initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + + if (BTPageIsRecyclable(page, heaprel)) + { /* - * It's possible to find an all-zeroes page in an index. For - * example, a backend might successfully extend the relation - * one page and then crash before it is able to make a WAL - * entry for adding the page. If we find a zeroed page then - * reclaim it immediately. + * If we are generating WAL for Hot Standby then create a WAL + * record that will allow us to conflict with queries running + * on standby, in case they have snapshots older than safexid + * value */ - if (PageIsNew(page)) + if (RelationNeedsWAL(rel) && XLogStandbyInfoActive()) { - /* Okay to use page. Initialize and return it. */ - _bt_pageinit(page, BufferGetPageSize(buf)); - return buf; - } + xl_btree_reuse_page xlrec_reuse; - if (BTPageIsRecyclable(page, heaprel)) - { /* - * If we are generating WAL for Hot Standby then create a - * WAL record that will allow us to conflict with queries - * running on standby, in case they have snapshots older - * than safexid value + * Note that we don't register the buffer with the record, + * because this operation doesn't modify the page (that + * already happened, back when VACUUM deleted the page). 
+ * This record only exists to provide a conflict point for + * Hot Standby. See record REDO routine comments. */ - if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) - _bt_log_reuse_page(rel, heaprel, blkno, - BTPageGetDeleteXid(page)); + xlrec_reuse.locator = rel->rd_locator; + xlrec_reuse.block = blkno; + xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page); + xlrec_reuse.isCatalogRel = + RelationIsAccessibleInLogicalDecoding(heaprel); - /* Okay to use page. Re-initialize and return it. */ - _bt_pageinit(page, BufferGetPageSize(buf)); - return buf; + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); + + XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); } - elog(DEBUG2, "FSM returned nonrecyclable page"); - _bt_relbuf(rel, buf); - } - else - { - elog(DEBUG2, "FSM returned nonlockable page"); - /* couldn't get lock, so just drop pin */ - ReleaseBuffer(buf); + + /* Okay to use page. Re-initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; } + elog(DEBUG2, "FSM returned nonrecyclable page"); + _bt_relbuf(rel, buf); + } + else + { + elog(DEBUG2, "FSM returned nonlockable page"); + /* couldn't get lock, so just drop pin */ + ReleaseBuffer(buf); } - - /* - * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or - * we risk a race condition against btvacuumscan --- see comments - * therein. This forces us to repeat the valgrind request that - * _bt_lockbuf() otherwise would make, as we can't use _bt_lockbuf() - * without introducing a race. - */ - buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL, - EB_LOCK_FIRST); - if (!RelationUsesLocalBuffers(rel)) - VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); - - /* Initialize the new page before returning it */ - page = BufferGetPage(buf); - Assert(PageIsNew(page)); - _bt_pageinit(page, BufferGetPageSize(buf)); } - /* ref count and lock type are correct */ + /* + * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we + * risk a race condition against btvacuumscan --- see comments therein. + * This forces us to repeat the valgrind request that _bt_lockbuf() + * otherwise would make, as we can't use _bt_lockbuf() without introducing + * a race. + */ + buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST); + if (!RelationUsesLocalBuffers(rel)) + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + + /* Initialize the new page before returning it */ + page = BufferGetPage(buf); + Assert(PageIsNew(page)); + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; } /* * _bt_relandgetbuf() -- release a locked buffer and get another one. * - * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the - * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer - * then it reduces to just _bt_getbuf; allowing this case simplifies some - * callers. + * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is + * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case + * simplifies some callers. * * The original motivation for using this was to avoid two entries to the * bufmgr when one would do. 
However, now it's mainly just a notational @@ -1001,7 +1004,7 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access) { Buffer buf; - Assert(blkno != P_NEW); + Assert(BlockNumberIsValid(blkno)); if (BufferIsValid(obuf)) _bt_unlockbuf(rel, obuf); buf = ReleaseAndReadBuffer(obuf, rel, blkno); @@ -1272,14 +1275,14 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * (a version that lacks the TIDs that are to be deleted). * * This is nearly the same as _bt_delitems_vacuum as far as what it does to - * the page, but it needs its own snapshotConflictHorizon (caller gets this - * from tableam). This is used by the REDO routine to generate recovery + * the page, but it needs its own snapshotConflictHorizon and isCatalogRel + * (from the tableam). This is used by the REDO routine to generate recovery * conflicts. The other difference is that only _bt_delitems_vacuum will * clear page's VACUUM cycle ID. */ static void -_bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf, - TransactionId snapshotConflictHorizon, +_bt_delitems_delete(Relation rel, Buffer buf, + TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable) { @@ -1343,10 +1346,10 @@ _bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf, XLogRecPtr recptr; xl_btree_delete xlrec_delete; - xlrec_delete.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel); xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon; xlrec_delete.ndeleted = ndeletable; xlrec_delete.nupdated = nupdatable; + xlrec_delete.isCatalogRel = isCatalogRel; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1517,6 +1520,7 @@ _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, { Page page = BufferGetPage(buf); TransactionId snapshotConflictHorizon; + bool isCatalogRel; OffsetNumber postingidxoffnum = InvalidOffsetNumber; int ndeletable = 0, nupdatable = 0; @@ -1525,6 +1529,7 @@ _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, /* Use tableam interface to determine which tuples to delete first */ snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate); + isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel); /* Should not WAL-log snapshotConflictHorizon unless it's required */ if (!XLogStandbyInfoActive()) @@ -1670,8 +1675,8 @@ _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, } /* Physically delete tuples (or TIDs) using deletable (or updatable) */ - _bt_delitems_delete(rel, heapRel, buf, snapshotConflictHorizon, deletable, - ndeletable, updatable, nupdatable); + _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel, + deletable, ndeletable, updatable, nupdatable); /* be tidy */ for (int i = 0; i < nupdatable; i++) @@ -1692,8 +1697,7 @@ _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, * same level must always be locked left to right to avoid deadlocks. 
*/ static bool -_bt_leftsib_splitflag(Relation rel, Relation heaprel, BlockNumber leftsib, - BlockNumber target) +_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) { Buffer buf; Page page; @@ -1704,7 +1708,7 @@ _bt_leftsib_splitflag(Relation rel, Relation heaprel, BlockNumber leftsib, if (leftsib == P_NONE) return false; - buf = _bt_getbuf(rel, heaprel, leftsib, BT_READ); + buf = _bt_getbuf(rel, leftsib, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -1750,7 +1754,7 @@ _bt_leftsib_splitflag(Relation rel, Relation heaprel, BlockNumber leftsib, * to-be-deleted subtree.) */ static bool -_bt_rightsib_halfdeadflag(Relation rel, Relation heaprel, BlockNumber leafrightsib) +_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) { Buffer buf; Page page; @@ -1759,7 +1763,7 @@ _bt_rightsib_halfdeadflag(Relation rel, Relation heaprel, BlockNumber leafrights Assert(leafrightsib != P_NONE); - buf = _bt_getbuf(rel, heaprel, leafrightsib, BT_READ); + buf = _bt_getbuf(rel, leafrightsib, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -1948,18 +1952,18 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) * marked with INCOMPLETE_SPLIT flag before proceeding */ Assert(leafblkno == scanblkno); - if (_bt_leftsib_splitflag(rel, vstate->info->heaprel, leftsib, leafblkno)) + if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) { ReleaseBuffer(leafbuf); return; } /* we need an insertion scan key for the search, so build one */ - itup_key = _bt_mkscankey(rel, vstate->info->heaprel, targetkey); + itup_key = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page with matching pivot/high key */ itup_key->pivotsearch = true; - stack = _bt_search(rel, vstate->info->heaprel, itup_key, - &sleafbuf, BT_READ, NULL); + stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, + NULL); /* won't need a second lock or pin on leafbuf */ _bt_relbuf(rel, sleafbuf); @@ -1990,7 +1994,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) * leafbuf page half-dead. */ Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque)); - if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf, stack)) + if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf, + stack)) { _bt_relbuf(rel, leafbuf); return; @@ -2053,7 +2058,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) if (!rightsib_empty) break; - leafbuf = _bt_getbuf(rel, vstate->info->heaprel, rightsib, BT_WRITE); + leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); } } @@ -2066,6 +2071,10 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) * but may include additional internal pages (at most one per level of the * tree below the root). * + * Caller must pass a valid heaprel, since it's just about possible that our + * call to _bt_lock_subtree_parent will need to allocate a new index page to + * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel. + * * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is * the rightmost child of its parent (and parent has more than one downlink). * Returns 'true' when the first stage of page deletion completed @@ -2094,6 +2103,7 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && P_ISLEAF(opaque) && !P_IGNORE(opaque) && P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); + Assert(heaprel != NULL); /* * Save info about the leaf page. 
@@ -2108,7 +2118,7 @@ _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, * delete the downlink. It would fail the "right sibling of target page * is also the next child in parent page" cross-check below. */ - if (_bt_rightsib_halfdeadflag(rel, heaprel, leafrightsib)) + if (_bt_rightsib_halfdeadflag(rel, leafrightsib)) { elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead", leafblkno, leafrightsib); @@ -2352,7 +2362,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, Assert(target != leafblkno); /* Fetch the block number of the target's left sibling */ - buf = _bt_getbuf(rel, vstate->info->heaprel, target, BT_READ); + buf = _bt_getbuf(rel, target, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); leftsib = opaque->btpo_prev; @@ -2379,7 +2389,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_lockbuf(rel, leafbuf, BT_WRITE); if (leftsib != P_NONE) { - lbuf = _bt_getbuf(rel, vstate->info->heaprel, leftsib, BT_WRITE); + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = BTPageGetOpaque(page); while (P_ISDELETED(opaque) || opaque->btpo_next != target) @@ -2427,7 +2437,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, CHECK_FOR_INTERRUPTS(); /* step right one page */ - lbuf = _bt_getbuf(rel, vstate->info->heaprel, leftsib, BT_WRITE); + lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = BTPageGetOpaque(page); } @@ -2491,7 +2501,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * And next write-lock the (current) right sibling. */ rightsib = opaque->btpo_next; - rbuf = _bt_getbuf(rel, vstate->info->heaprel, rightsib, BT_WRITE); + rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); page = BufferGetPage(rbuf); opaque = BTPageGetOpaque(page); @@ -2538,7 +2548,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * of doing so are slim, and the locking considerations daunting.) * * We can safely acquire a lock on the metapage here --- see comments for - * _bt_newroot(). + * _bt_newlevel(). */ if (leftsib == P_NONE && rightsib_is_rightmost) { @@ -2547,8 +2557,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (P_RIGHTMOST(opaque)) { /* rightsib will be the only one left on the level */ - metabuf = _bt_getbuf(rel, vstate->info->heaprel, BTREE_METAPAGE, - BT_WRITE); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); @@ -2906,7 +2915,7 @@ _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child, * * Note: We deliberately avoid completing incomplete splits here. 
*/ - if (_bt_leftsib_splitflag(rel, heaprel, leftsibparent, parent)) + if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) return false; /* Recurse to examine child page's grandparent page */ @@ -2976,6 +2985,7 @@ _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate) Relation heaprel = vstate->info->heaprel; Assert(stats->pages_newly_deleted >= vstate->npendingpages); + Assert(heaprel != NULL); if (vstate->npendingpages == 0) { diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 1ce5b15199..4553aaee53 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -835,7 +835,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) if (stats == NULL) { /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ - if (!_bt_vacuum_needs_cleanup(info->index, info->heaprel)) + if (!_bt_vacuum_needs_cleanup(info->index)) return NULL; /* @@ -871,7 +871,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) */ Assert(stats->pages_deleted >= stats->pages_free); num_delpages = stats->pages_deleted - stats->pages_free; - _bt_set_cleanup_info(info->index, info->heaprel, num_delpages); + _bt_set_cleanup_info(info->index, num_delpages); /* * It's quite possible for us to be fooled by concurrent page splits into diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 263f75fce9..7e05e58676 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -42,8 +42,7 @@ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); -static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, - Snapshot snapshot); +static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); @@ -92,6 +91,9 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) * When access = BT_READ, an empty index will result in *bufP being set to * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered * during the search will be finished. + * + * heaprel must be provided by callers that pass access = BT_WRITE, since we + * might need to allocate a new root page for caller -- see _bt_allocbuf. */ BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, @@ -100,6 +102,10 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, BTStack stack_in = NULL; int page_access = BT_READ; + /* heaprel must be set whenever _bt_allocbuf is reachable */ + Assert(access == BT_READ || access == BT_WRITE); + Assert(access == BT_READ || heaprel != NULL); + /* Get the root page to start with */ *bufP = _bt_getroot(rel, heaprel, access); @@ -222,8 +228,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, * * If forupdate is true, we will attempt to finish any incomplete splits * that we encounter. This is required when locking a target page for an - * insertion, because we don't allow inserting on a page before the split - * is completed. 'stack' is only used if forupdate is true. + * insertion, because we don't allow inserting on a page before the split is + * completed. 
'heaprel' and 'stack' are only used if forupdate is true. * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire @@ -247,6 +253,8 @@ _bt_moveright(Relation rel, BTPageOpaque opaque; int32 cmpval; + Assert(!forupdate || heaprel != NULL); + /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split @@ -295,7 +303,7 @@ _bt_moveright(Relation rel, _bt_relbuf(rel, buf); /* re-acquire the lock in the right mode, and re-check */ - buf = _bt_getbuf(rel, heaprel, blkno, access); + buf = _bt_getbuf(rel, blkno, access); continue; } @@ -862,7 +870,6 @@ bool _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; - Relation heaprel = scan->heapRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; BTStack stack; @@ -1355,7 +1362,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* Initialize remaining insertion scan key fields */ - _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage); + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; @@ -1366,7 +1373,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Use the manufactured insertion scan key to descend the tree and * position ourselves on the target leaf page. */ - stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot); + stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot); /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -2007,7 +2014,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); /* step right one page */ - so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ); + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(so->currPos.buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = BTPageGetOpaque(page); @@ -2081,8 +2088,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) if (BTScanPosIsPinned(so->currPos)) _bt_lockbuf(rel, so->currPos.buf, BT_READ); else - so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, - so->currPos.currPage, BT_READ); + so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); for (;;) { @@ -2096,8 +2102,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) } /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, scan->heapRelation, - so->currPos.buf, scan->xs_snapshot); + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, + scan->xs_snapshot); /* if we're physically at end of index, return failure */ if (so->currPos.buf == InvalidBuffer) @@ -2144,8 +2150,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) BTScanPosInvalidate(so->currPos); return false; } - so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, - BT_READ); + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); } } } @@ -2190,7 +2195,7 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * again if it's important. 
*/ static Buffer -_bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot) +_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) { Page page; BTPageOpaque opaque; @@ -2218,7 +2223,7 @@ _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot) _bt_relbuf(rel, buf); /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); - buf = _bt_getbuf(rel, heaprel, blkno, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); @@ -2309,7 +2314,7 @@ _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot) * The returned buffer is pinned and read-locked. */ Buffer -_bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost, +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot) { Buffer buf; @@ -2325,9 +2330,9 @@ _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost, * smarter about intermediate levels.) */ if (level == 0) - buf = _bt_getroot(rel, heaprel, BT_READ); + buf = _bt_getroot(rel, NULL, BT_READ); else - buf = _bt_gettrueroot(rel, heaprel); + buf = _bt_gettrueroot(rel); if (!BufferIsValid(buf)) return InvalidBuffer; @@ -2408,8 +2413,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, scan->heapRelation, 0, - ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); if (!BufferIsValid(buf)) { diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6ad3f3c54d..c2665fce41 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -566,7 +566,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - wstate.inskey = _bt_mkscankey(wstate.index, btspool->heap, NULL); + wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); wstate.btws_use_wal = RelationNeedsWAL(wstate.index); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 05abf36032..7da499c4dd 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -87,7 +87,7 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft, * field themselves. */ BTScanInsert -_bt_mkscankey(Relation rel, Relation heaprel, IndexTuple itup) +_bt_mkscankey(Relation rel, IndexTuple itup) { BTScanInsert key; ScanKey skey; @@ -112,7 +112,7 @@ _bt_mkscankey(Relation rel, Relation heaprel, IndexTuple itup) key = palloc(offsetof(BTScanInsertData, scankeys) + sizeof(ScanKeyData) * indnkeyatts); if (itup) - _bt_metaversion(rel, heaprel, &key->heapkeyspace, &key->allequalimage); + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); else { /* Utility statement callers can set these fields themselves */ @@ -1761,8 +1761,7 @@ _bt_killitems(IndexScanDesc scan) droppedpin = true; /* Attempt to re-read the buffer, getting pin and lock. 
*/ - buf = _bt_getbuf(scan->indexRelation, scan->heapRelation, - so->currPos.currPage, BT_READ); + buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); page = BufferGetPage(buf); if (BufferGetLSNAtomic(buf) == so->currPos.lsn) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 65adf04c4e..39932d3c2d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -462,7 +462,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * For btrees, get tree height while we have the index * open */ - info->tree_height = _bt_getrootheight(indexRelation, relation); + info->tree_height = _bt_getrootheight(indexRelation); } else { diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 0188106925..eb6cfcfd00 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -207,7 +207,6 @@ tuplesort_begin_heap(TupleDesc tupDesc, Tuplesortstate * tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, - Relation heaprel, int workMem, SortCoordinate coordinate, int sortopt) { @@ -261,7 +260,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, arg->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - indexScanKey = _bt_mkscankey(indexRel, heaprel, NULL); + indexScanKey = _bt_mkscankey(indexRel, NULL); if (arg->indexInfo->ii_Expressions != NULL) { @@ -362,7 +361,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; - indexScanKey = _bt_mkscankey(indexRel, heapRel, NULL); + indexScanKey = _bt_mkscankey(indexRel, NULL); /* Prepare SortSupport data for each column */ base->sortKeys = (SortSupport) palloc0(base->nKeys * diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d684786095..8891fa7973 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -293,11 +293,14 @@ BTPageIsRecyclable(Page page, Relation heaprel) BTPageOpaque opaque; Assert(!PageIsNew(page)); + Assert(heaprel != NULL); /* Recycling okay iff page is deleted and safexid is old enough */ opaque = BTPageGetOpaque(page); if (P_ISDELETED(opaque)) { + FullTransactionId safexid = BTPageGetDeleteXid(page); + /* * The page was deleted, but when? If it was just deleted, a scan * might have seen the downlink to it, and will read the page later. @@ -308,7 +311,7 @@ BTPageIsRecyclable(Page page, Relation heaprel) * anyone. If not, then no scan that's still in progress could have * seen its downlink, and we can recycle it. 
*/ - return GlobalVisCheckRemovableFullXid(heaprel, BTPageGetDeleteXid(page)); + return GlobalVisCheckRemovableFullXid(heaprel, safexid); } return false; @@ -1194,18 +1197,17 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, */ extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage); -extern bool _bt_vacuum_needs_cleanup(Relation rel, Relation heaprel); -extern void _bt_set_cleanup_info(Relation rel, Relation heaprel, - BlockNumber num_delpages); +extern bool _bt_vacuum_needs_cleanup(Relation rel); +extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access); -extern Buffer _bt_gettrueroot(Relation rel, Relation heaprel); -extern int _bt_getrootheight(Relation rel, Relation heaprel); -extern void _bt_metaversion(Relation rel, Relation heaprel, bool *heapkeyspace, +extern Buffer _bt_gettrueroot(Relation rel); +extern int _bt_getrootheight(Relation rel); +extern void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage); extern void _bt_checkpage(Relation rel, Buffer buf); -extern Buffer _bt_getbuf(Relation rel, Relation heaprel, BlockNumber blkno, - int access); +extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); +extern Buffer _bt_allocbuf(Relation rel, Relation heaprel); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access); extern void _bt_relbuf(Relation rel, Buffer buf); @@ -1237,13 +1239,13 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); -extern Buffer _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, - bool rightmost, Snapshot snapshot); +extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + Snapshot snapshot); /* * prototypes for functions in nbtutils.c */ -extern BTScanInsert _bt_mkscankey(Relation rel, Relation heaprel, IndexTuple itup); +extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 656a2e9897..af057b6358 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -399,9 +399,7 @@ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - Relation heaprel, - int workMem, + Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt); extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
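/*
 * A minimal sketch (hypothetical caller, not taken from the patch): the
 * buffer-access contract after this change.  _bt_getbuf() now only reads
 * existing blocks and no longer takes a heap relation; brand new pages come
 * from _bt_allocbuf(), which still requires heaprel because recycling a
 * deleted page from the FSM may have to WAL-log a snapshotConflictHorizon
 * for Hot Standby.
 */
#include "postgres.h"

#include "access/nbtree.h"

static Buffer
hypothetical_get_page(Relation rel, Relation heaprel, BlockNumber blkno)
{
	if (BlockNumberIsValid(blkno))
	{
		/* Existing page: no heap relation needed */
		return _bt_getbuf(rel, blkno, BT_WRITE);
	}

	/* New page: caller must supply a valid heap relation */
	Assert(heaprel != NULL);
	return _bt_allocbuf(rel, heaprel);
}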
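/*
 * A second sketch (hypothetical function, assuming backend context): with
 * heaprel removed from _bt_mkscankey() and the metapage helpers, read-only
 * descents can pass NULL for heaprel to _bt_search(), since they can never
 * reach _bt_allocbuf(); write-mode descents, which might need to finish an
 * incomplete split or create a new root, must still pass a valid heaprel.
 */
#include "postgres.h"

#include "access/nbtree.h"

static void
hypothetical_descend(Relation rel, Relation heaprel, IndexTuple itup)
{
	BTScanInsert key = _bt_mkscankey(rel, itup);	/* heaprel no longer needed */
	Buffer		buf;
	BTStack		stack;

	/* Read-only search: NULL heaprel is fine */
	stack = _bt_search(rel, NULL, key, &buf, BT_READ, NULL);
	if (BufferIsValid(buf))
		_bt_relbuf(rel, buf);
	_bt_freestack(stack);

	/* Write-mode search: a valid heap relation is required */
	Assert(heaprel != NULL);
	stack = _bt_search(rel, heaprel, key, &buf, BT_WRITE, NULL);
	if (BufferIsValid(buf))
		_bt_relbuf(rel, buf);
	_bt_freestack(stack);

	pfree(key);
}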