/*-------------------------------------------------------------------------
 *
 * nbtinsert.c
 *	  Item insertion in Lehman and Yao btrees for Postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"

/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL	2

static BTStack _bt_search_insert(Relation rel, Relation heaprel,
								 BTInsertState insertstate);
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
									  Relation heapRel,
									  IndexUniqueCheck checkUnique, bool *is_unique,
									  uint32 *speculativeToken);
static OffsetNumber _bt_findinsertloc(Relation rel,
									  BTInsertState insertstate,
									  bool checkingunique,
									  bool indexUnchanged,
									  BTStack stack,
									  Relation heapRel);
static void _bt_stepright(Relation rel, Relation heaprel,
						  BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
						   Buffer buf,
						   Buffer cbuf,
						   BTStack stack,
						   IndexTuple itup,
						   Size itemsz,
						   OffsetNumber newitemoff,
						   int postingoff,
						   bool split_only_page);
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
						Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
						Size newitemsz, IndexTuple newitem,
						IndexTuple orignewitem, IndexTuple nposting,
						uint16 postingoff);
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
							  Buffer rbuf, BTStack stack, bool isroot,
							  bool isonly);
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf,
						   Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
								OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
										 BTInsertState insertstate,
										 bool simpleonly, bool checkingunique,
										 bool uniquedup, bool indexUnchanged);
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
							   OffsetNumber *deletable, int ndeletable,
							   IndexTuple newitem, OffsetNumber minoff,
							   OffsetNumber maxoff);
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
								   int ndeletable, IndexTuple newitem,
								   int *nblocks);
static inline int	_bt_blk_cmp(const void *arg1, const void *arg2);

/*
 *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *		This routine is called by the public interface routine, btinsert.
 *		By here, itup is filled in, including the TID.
 *
 *		If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *		will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
 *		UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *		For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *		don't actually insert.
 *
 *		indexUnchanged executor hint indicates if itup is from an
 *		UPDATE that didn't logically change the indexed value, but
 *		must nevertheless have a new entry to point to a successor
 *		version.
 *
 *		The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *		it must be true if the entry is known unique, else false.
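 *
 *		As a rough illustration of how the four modes behave (a hedged
 *		sketch only; allows_duplicates() is a made-up helper, not part of
 *		nbtree):
 *
 *			static bool
 *			allows_duplicates(IndexUniqueCheck checkUnique)
 *			{
 *				switch (checkUnique)
 *				{
 *					case UNIQUE_CHECK_NO:		// never checked
 *					case UNIQUE_CHECK_PARTIAL:	// checked, but never errors here
 *						return true;
 *					case UNIQUE_CHECK_YES:		// duplicate raises an error
 *					case UNIQUE_CHECK_EXISTING: // recheck only, no insertion
 *					default:
 *						return false;
 *				}
 *			}
 *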
* (In the current implementation we'll also return true after a * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but * that's just a coding artifact.) */ bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel) { bool is_unique = false; BTInsertStateData insertstate; BTScanInsert itup_key; BTStack stack; bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); /* we need an insertion scan key to do our search, so build one */ itup_key = _bt_mkscankey(rel, itup); if (checkingunique) { if (!itup_key->anynullkeys) { /* No (heapkeyspace) scantid until uniqueness established */ itup_key->scantid = NULL; } else { /* * Scan key for new tuple contains NULL key values. Bypass * checkingunique steps. They are unnecessary because core code * considers NULL unequal to every value, including NULL. * * This optimization avoids O(N^2) behavior within the * _bt_findinsertloc() heapkeyspace path when a unique index has a * large number of "duplicates" with NULL key values. */ checkingunique = false; /* Tuple is unique in the sense that core code cares about */ Assert(checkUnique != UNIQUE_CHECK_EXISTING); is_unique = true; } } /* * Fill in the BTInsertState working area, to track the current page and * position within the page to insert on. * * Note that itemsz is passed down to lower level code that deals with * inserting the item. It must be MAXALIGN()'d. This ensures that space * accounting code consistently considers the alignment overhead that we * expect PageAddItem() will add later. (Actually, index_form_tuple() is * already conservative about alignment, but we don't rely on that from * this distance. Besides, preserving the "true" tuple size in index * tuple headers for the benefit of nbtsplitloc.c might happen someday. * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) */ insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = itup_key; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; insertstate.postingoff = 0; search: /* * Find and lock the leaf page that the tuple should be added to by * searching from the root page. insertstate.buf will hold a buffer that * is locked in exclusive mode afterwards. */ stack = _bt_search_insert(rel, heapRel, &insertstate); /* * checkingunique inserts are not allowed to go ahead when two tuples with * equal key attribute values would be visible to new MVCC snapshots once * the xact commits. Check for conflicts in the locked page/buffer (if * needed) here. * * It might be necessary to check a page to the right in _bt_check_unique, * though that should be very rare. In practice the first page the value * could be on (with scantid omitted) is almost always also the only page * that a matching tuple might be found on. This is due to the behavior * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can * only be allowed to cross a page boundary when there is no candidate * leaf page split point that avoids it. Also, _bt_check_unique can use * the leaf page high key to determine that there will be no duplicates on * the right sibling without actually visiting it (it uses the high key in * cases where the new item happens to belong at the far right of the leaf * page). * * NOTE: obviously, _bt_check_unique can only detect keys that are already * in the index; so it cannot defend against concurrent insertions of the * same key. 
We protect against that by means of holding a write lock on * the first page the value could be on, with omitted/-inf value for the * implicit heap TID tiebreaker attribute. Any other would-be inserter of * the same key must acquire a write lock on the same page, so only one * would-be inserter can be making the check at one time. Furthermore, * once we are past the check we hold write locks continuously until we * have performed our insertion, so no later inserter can fail to see our * insertion. (This requires some care in _bt_findinsertloc.) * * If we must wait for another xact, we release the lock while waiting, * and then must perform a new search. * * For a partial uniqueness check, we don't wait for the other xact. Just * let the tuple in and return false for possibly non-unique, or true for * definitely unique. */ if (checkingunique) { TransactionId xwait; uint32 speculativeToken; xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, &is_unique, &speculativeToken); if (unlikely(TransactionIdIsValid(xwait))) { /* Have to wait for the other guy ... */ _bt_relbuf(rel, insertstate.buf); insertstate.buf = InvalidBuffer; /* * If it's a speculative insertion, wait for it to finish (ie. to * go ahead with the insertion, or kill the tuple). Otherwise * wait for the transaction to finish as usual. */ if (speculativeToken) SpeculativeInsertionWait(xwait, speculativeToken); else XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); /* start over... */ if (stack) _bt_freestack(stack); goto search; } /* Uniqueness is established -- restore heap tid as scantid */ if (itup_key->heapkeyspace) itup_key->scantid = &itup->t_tid; } if (checkUnique != UNIQUE_CHECK_EXISTING) { OffsetNumber newitemoff; /* * The only conflict predicate locking cares about for indexes is when * an index tuple insert conflicts with an existing lock. We don't * know the actual page we're going to insert on for sure just yet in * checkingunique and !heapkeyspace cases, but it's okay to use the * first page the value could be on (with scantid omitted) instead. */ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); /* * Do the insertion. Note that insertstate contains cached binary * search bounds established within _bt_check_unique when insertion is * checkingunique. */ newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, indexUnchanged, stack, heapRel); _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer, stack, itup, insertstate.itemsz, newitemoff, insertstate.postingoff, false); } else { /* just release the buffer */ _bt_relbuf(rel, insertstate.buf); } /* be tidy */ if (stack) _bt_freestack(stack); pfree(itup_key); return is_unique; } /* * _bt_search_insert() -- _bt_search() wrapper for inserts * * Search the tree for a particular scankey, or more precisely for the first * leaf page it could be on. Try to make use of the fastpath optimization's * rightmost leaf page cache before actually searching the tree from the root * page, though. * * Return value is a stack of parent-page pointers (though see notes about * fastpath optimization and page splits below). insertstate->buf is set to * the address of the leaf-page buffer, which is write-locked and pinned in * all cases (if necessary by creating a new empty root page for caller). * * The fastpath optimization avoids most of the work of searching the tree * repeatedly when a single backend inserts successive new tuples on the * rightmost leaf page of an index. 
A backend cache of the rightmost leaf * page is maintained within _bt_insertonpg(), and used here. The cache is * invalidated here when an insert of a non-pivot tuple must take place on a * non-rightmost leaf page. * * The optimization helps with indexes on an auto-incremented field. It also * helps with indexes on datetime columns, as well as indexes with lots of * NULL values. (NULLs usually get inserted in the rightmost page for single * column indexes, since they usually get treated as coming after everything * else in the key space. Individual NULL tuples will generally be placed on * the rightmost leaf page due to the influence of the heap TID column.) * * Note that we avoid applying the optimization when there is insufficient * space on the rightmost page to fit caller's new item. This is necessary * because we'll need to return a real descent stack when a page split is * expected (actually, caller can cope with a leaf page split that uses a NULL * stack, but that's very slow and so must be avoided). Note also that the * fastpath optimization acquires the lock on the page conditionally as a way * of reducing extra contention when there are concurrent insertions into the * rightmost page (we give up if we'd have to wait for the lock). We assume * that it isn't useful to apply the optimization when there is contention, * since each per-backend cache won't stay valid for long. */ static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) { Assert(insertstate->buf == InvalidBuffer); Assert(!insertstate->bounds_valid); Assert(insertstate->postingoff == 0); if (RelationGetTargetBlock(rel) != InvalidBlockNumber) { /* Simulate a _bt_getbuf() call with conditional locking */ insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); if (_bt_conditionallockbuf(rel, insertstate->buf)) { Page page; BTPageOpaque opaque; _bt_checkpage(rel, insertstate->buf); page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); /* * Check if the page is still the rightmost leaf page and has * enough free space to accommodate the new tuple. Also check * that the insertion scan key is strictly greater than the first * non-pivot tuple on the page. (Note that we expect itup_key's * scantid to be unset when our caller is a checkingunique * inserter.) */ if (P_RIGHTMOST(opaque) && P_ISLEAF(opaque) && !P_IGNORE(opaque) && PageGetFreeSpace(page) > insertstate->itemsz && PageGetMaxOffsetNumber(page) >= P_HIKEY && _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) { /* * Caller can use the fastpath optimization because cached * block is still rightmost leaf page, which can fit caller's * new tuple without splitting. Keep block in local cache for * next insert, and have caller use NULL stack. * * Note that _bt_insert_parent() has an assertion that catches * leaf page splits that somehow follow from a fastpath insert * (it should only be passed a NULL stack when it must deal * with a concurrent root page split, and never because a NULL * stack was returned here). 
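 *
 * As a hedged sketch of the other half of this optimization -- the point
 * in _bt_insertonpg() where the cache is populated -- using illustrative
 * shorthand (isrightmost/isleaf/isroot are that function's local flags):
 *
 *		// Cache the target block only for rightmost, non-root leaf pages,
 *		// and only once the tree is tall enough that skipping the descent
 *		// actually saves a meaningful amount of work.
 *		if (isrightmost && isleaf && !isroot &&
 *			_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
 *			RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
 *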
*/ return NULL; } /* Page unsuitable for caller, drop lock and pin */ _bt_relbuf(rel, insertstate->buf); } else { /* Lock unavailable, drop pin */ ReleaseBuffer(insertstate->buf); } /* Forget block, since cache doesn't appear to be useful */ RelationSetTargetBlock(rel, InvalidBlockNumber); } /* Cannot use optimization -- descend tree, return proper descent stack */ return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf, BT_WRITE); } /* * _bt_check_unique() -- Check for violation of unique index constraint * * Returns InvalidTransactionId if there is no conflict, else an xact ID * we must wait for to see if it commits a conflicting tuple. If an actual * conflict is detected, no return --- just ereport(). If an xact ID is * returned, and the conflicting tuple still has a speculative insertion in * progress, *speculativeToken is set to non-zero, and the caller can wait for * the verdict on the insertion using SpeculativeInsertionWait(). * * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return * InvalidTransactionId because we don't want to wait. In this case we * set *is_unique to false if there is a potential conflict, and the * core code must redo the uniqueness check later. * * As a side-effect, sets state in insertstate that can later be used by * _bt_findinsertloc() to reuse most of the binary search work we do * here. * * This code treats NULLs as equal, unlike the default semantics for unique * indexes. So do not call here when there are NULL values in scan key and * the index uses the default NULLS DISTINCT mode. */ static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken) { IndexTuple itup = insertstate->itup; IndexTuple curitup = NULL; ItemId curitemid = NULL; BTScanInsert itup_key = insertstate->itup_key; SnapshotData SnapshotDirty; OffsetNumber offset; OffsetNumber maxoff; Page page; BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; bool inposting = false; bool prevalldead = true; int curposti = 0; /* Assume unique until we find a duplicate */ *is_unique = true; InitDirtySnapshot(SnapshotDirty); page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); maxoff = PageGetMaxOffsetNumber(page); /* * Find the first tuple with the same key. * * This also saves the binary search bounds in insertstate. We use them * in the fastpath below, but also in the _bt_findinsertloc() call later. */ Assert(!insertstate->bounds_valid); offset = _bt_binsrch_insert(rel, insertstate); /* * Scan over all equal tuples, looking for live conflicts. */ Assert(!insertstate->bounds_valid || insertstate->low == offset); Assert(!itup_key->anynullkeys); Assert(itup_key->scantid == NULL); for (;;) { /* * Each iteration of the loop processes one heap TID, not one index * tuple. Current offset number for page isn't usually advanced on * iterations that process heap TIDs from posting list tuples. * * "inposting" state is set when _inside_ a posting list --- not when * we're at the start (or end) of a posting list. We advance curposti * at the end of the iteration when inside a posting list tuple. In * general, every loop iteration either advances the page offset or * advances curposti --- an iteration that handles the rightmost/max * heap TID in a posting list finally advances the page offset (and * unsets "inposting"). * * Make sure the offset points to an actual index tuple before trying * to examine it... 
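 *
 * A stripped-down sketch of this iteration order (hedged; tuple_at(),
 * ntids(), tid() and visit() are made-up helpers, not nbtree routines):
 *
 *		for (off = P_FIRSTDATAKEY(opaque); off <= maxoff;
 *			 off = OffsetNumberNext(off))
 *		{
 *			IndexTuple	tup = tuple_at(page, off);
 *
 *			for (int i = 0; i < ntids(tup); i++)	// 1 for a plain tuple,
 *				visit(tid(tup, i));					// >1 for a posting list
 *		}
 *
 * The loop below flattens the inner loop into the outer one using the
 * "inposting"/"curposti" state, so that it can also stop early at the
 * first non-equal key, skip killed items, and step right onto the next
 * leaf page in the middle of the scan.
 *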
*/ if (offset <= maxoff) { /* * Fastpath: In most cases, we can use cached search bounds to * limit our consideration to items that are definitely * duplicates. This fastpath doesn't apply when the original page * is empty, or when initial offset is past the end of the * original page, which may indicate that we need to examine a * second or subsequent page. * * Note that this optimization allows us to avoid calling * _bt_compare() directly when there are no duplicates, as long as * the offset where the key will go is not at the end of the page. */ if (nbuf == InvalidBuffer && offset == insertstate->stricthigh) { Assert(insertstate->bounds_valid); Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); Assert(insertstate->low <= insertstate->stricthigh); Assert(_bt_compare(rel, itup_key, page, offset) < 0); break; } /* * We can skip items that are already marked killed. * * In the presence of heavy update activity an index may contain * many killed items with the same key; running _bt_compare() on * each killed item gets expensive. Just advance over killed * items as quickly as we can. We only apply _bt_compare() when * we get to a non-killed item. We could reuse the bounds to * avoid _bt_compare() calls for known equal tuples, but it * doesn't seem worth it. */ if (!inposting) curitemid = PageGetItemId(page, offset); if (inposting || !ItemIdIsDead(curitemid)) { ItemPointerData htid; bool all_dead = false; if (!inposting) { /* Plain tuple, or first TID in posting list tuple */ if (_bt_compare(rel, itup_key, page, offset) != 0) break; /* we're past all the equal tuples */ /* Advanced curitup */ curitup = (IndexTuple) PageGetItem(page, curitemid); Assert(!BTreeTupleIsPivot(curitup)); } /* okay, we gotta fetch the heap tuple using htid ... */ if (!BTreeTupleIsPosting(curitup)) { /* ... htid is from simple non-pivot tuple */ Assert(!inposting); htid = curitup->t_tid; } else if (!inposting) { /* ... htid is first TID in new posting list */ inposting = true; prevalldead = true; curposti = 0; htid = *BTreeTupleGetPostingN(curitup, 0); } else { /* ... htid is second or subsequent TID in posting list */ Assert(curposti > 0); htid = *BTreeTupleGetPostingN(curitup, curposti); } /* * If we are doing a recheck, we expect to find the tuple we * are rechecking. It's not a duplicate, but we have to keep * scanning. */ if (checkUnique == UNIQUE_CHECK_EXISTING && ItemPointerCompare(&htid, &itup->t_tid) == 0) { found = true; } /* * Check if there's any table tuples for this index entry * satisfying SnapshotDirty. This is necessary because for AMs * with optimizations like heap's HOT, we have just a single * index entry for the entire chain. */ else if (table_index_fetch_tuple_check(heapRel, &htid, &SnapshotDirty, &all_dead)) { TransactionId xwait; /* * It is a duplicate. If we are only doing a partial * check, then don't bother checking if the tuple is being * updated in another transaction. Just return the fact * that it is a potential conflict and leave the full * check till later. Don't invalidate binary search * bounds. */ if (checkUnique == UNIQUE_CHECK_PARTIAL) { if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); *is_unique = false; return InvalidTransactionId; } /* * If this tuple is being updated by other transaction * then we have to wait for its commit/abort. */ xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? SnapshotDirty.xmin : SnapshotDirty.xmax; if (TransactionIdIsValid(xwait)) { if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); /* Tell _bt_doinsert to wait... 
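 *
 * (Illustrative note on the xid chosen above, restated with a made-up
 * variable name:
 *
 *		xwait = TransactionIdIsValid(dirty.xmin)
 *			? dirty.xmin	// conflicting tuple's inserter is still running
 *			: dirty.xmax;	// otherwise a deleter/updater is still running
 *
 * and speculativeToken, filled in below, additionally tells the caller to
 * use SpeculativeInsertionWait() instead of waiting out the whole
 * transaction.)
 *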
*/ *speculativeToken = SnapshotDirty.speculativeToken; /* Caller releases lock on buf immediately */ insertstate->bounds_valid = false; return xwait; } /* * Otherwise we have a definite conflict. But before * complaining, look to see if the tuple we want to insert * is itself now committed dead --- if so, don't complain. * This is a waste of time in normal scenarios but we must * do it to support CREATE INDEX CONCURRENTLY. * * We must follow HOT-chains here because during * concurrent index build, we insert the root TID though * the actual tuple may be somewhere in the HOT-chain. * While following the chain we might not stop at the * exact tuple which triggered the insert, but that's OK * because if we find a live tuple anywhere in this chain, * we have a unique key conflict. The other live tuple is * not part of this chain because it had a different index * entry. */ htid = itup->t_tid; if (table_index_fetch_tuple_check(heapRel, &htid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ } else { /* * It's been deleted, so no error, and no need to * continue searching */ break; } /* * Check for a conflict-in as we would if we were going to * write to this page. We aren't actually going to write, * but we want a chance to report SSI conflicts that would * otherwise be masked by this unique constraint * violation. */ CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); /* * This is a definite conflict. Break the tuple down into * datums and report the error. But first, make sure we * release the buffer locks we're holding --- * BuildIndexValueDescription could make catalog accesses, * which in the worst case might touch this same index and * cause deadlocks. */ if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); _bt_relbuf(rel, insertstate->buf); insertstate->buf = InvalidBuffer; insertstate->bounds_valid = false; { Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; index_deform_tuple(itup, RelationGetDescr(rel), values, isnull); key_desc = BuildIndexValueDescription(rel, values, isnull); ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), errmsg("duplicate key value violates unique constraint \"%s\"", RelationGetRelationName(rel)), key_desc ? errdetail("Key %s already exists.", key_desc) : 0, errtableconstraint(heapRel, RelationGetRelationName(rel)))); } } else if (all_dead && (!inposting || (prevalldead && curposti == BTreeTupleGetNPosting(curitup) - 1))) { /* * The conflicting tuple (or all HOT chains pointed to by * all posting list TIDs) is dead to everyone, so mark the * index entry killed. */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; /* * Mark buffer with a dirty hint, since state is not * crucial. Be sure to mark the proper buffer dirty. 
*/ if (nbuf != InvalidBuffer) MarkBufferDirtyHint(nbuf, true); else MarkBufferDirtyHint(insertstate->buf, true); } /* * Remember if posting list tuple has even a single HOT chain * whose members are not all dead */ if (!all_dead && inposting) prevalldead = false; } } if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) { /* Advance to next TID in same posting list */ curposti++; continue; } else if (offset < maxoff) { /* Advance to next tuple */ curposti = 0; inposting = false; offset = OffsetNumberNext(offset); } else { int highkeycmp; /* If scankey == hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); Assert(highkeycmp <= 0); if (highkeycmp != 0) break; /* Advance to next non-dead page --- there must be one */ for (;;) { BlockNumber nblkno = opaque->btpo_next; nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); page = BufferGetPage(nbuf); opaque = BTPageGetOpaque(page); if (!P_IGNORE(opaque)) break; if (P_RIGHTMOST(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); } /* Will also advance to next tuple */ curposti = 0; inposting = false; maxoff = PageGetMaxOffsetNumber(page); offset = P_FIRSTDATAKEY(opaque); /* Don't invalidate binary search bounds */ } } /* * If we are doing a recheck then we should have found the tuple we are * checking. Otherwise there's something very wrong --- probably, the * index is on a non-immutable expression. */ if (checkUnique == UNIQUE_CHECK_EXISTING && !found) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("failed to re-find tuple within index \"%s\"", RelationGetRelationName(rel)), errhint("This may be because of a non-immutable index expression."), errtableconstraint(heapRel, RelationGetRelationName(rel)))); if (nbuf != InvalidBuffer) _bt_relbuf(rel, nbuf); return InvalidTransactionId; } /* * _bt_findinsertloc() -- Finds an insert location for a tuple * * On entry, insertstate buffer contains the page the new tuple belongs * on. It is exclusive-locked and pinned by the caller. * * If 'checkingunique' is true, the buffer on entry is the first page * that contains duplicates of the new key. If there are duplicates on * multiple pages, the correct insertion position might be some page to * the right, rather than the first page. In that case, this function * moves right to the correct target page. * * (In a !heapkeyspace index, there can be multiple pages with the same * high key, where the new tuple could legitimately be placed on. In * that case, the caller passes the first page containing duplicates, * just like when checkingunique=true. If that page doesn't have enough * room for the new tuple, this function moves right, trying to find a * legal page that does.) * * If 'indexUnchanged' is true, this is for an UPDATE that didn't * logically change the indexed value, but must nevertheless have a new * entry to point to a successor version. This hint from the executor * will influence our behavior when the page might have to be split and * we must consider our options. Bottom-up index deletion can avoid * pathological version-driven page splits, but we only want to go to the * trouble of trying it when we already have moderate confidence that * it's appropriate. The hint should not significantly affect our * behavior over time unless practically all inserts on to the leaf page * get the hint. * * On exit, insertstate buffer contains the chosen insertion page, and * the offset within that page is returned. 
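 *
 * As a self-contained sketch of what "the offset within that page" means
 * (hedged: plain binary search over a sorted int array, not the real
 * _bt_binsrch_insert(), and insert_offset() is a made-up name):
 *
 *		// Slot where newkey must go for the array to stay sorted; equal
 *		// keys are placed after the existing ones.
 *		static int
 *		insert_offset(const int *keys, int nkeys, int newkey)
 *		{
 *			int		low = 0,
 *					high = nkeys;
 *
 *			while (low < high)
 *			{
 *				int		mid = low + (high - low) / 2;
 *
 *				if (keys[mid] <= newkey)
 *					low = mid + 1;
 *				else
 *					high = mid;
 *			}
 *			return low;
 *		}
 *
 * The real routine works on index tuples, can reuse the low/stricthigh
 * bounds cached by _bt_check_unique(), and reports posting list overlap
 * through insertstate->postingoff.
 *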
If _bt_findinsertloc needed * to move right, the lock and pin on the original page are released, and * the new buffer is exclusively locked and pinned instead. * * If insertstate contains cached binary search bounds, we will take * advantage of them. This avoids repeating comparisons that we made in * _bt_check_unique() already. */ static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel) { BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque opaque; OffsetNumber newitemoff; opaque = BTPageGetOpaque(page); /* Check 1/3 of a page restriction */ if (unlikely(insertstate->itemsz > BTMaxItemSize(page))) _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, insertstate->itup); Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); Assert(!insertstate->bounds_valid || checkingunique); Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); Assert(!itup_key->allequalimage || itup_key->heapkeyspace); if (itup_key->heapkeyspace) { /* Keep track of whether checkingunique duplicate seen */ bool uniquedup = indexUnchanged; /* * If we're inserting into a unique index, we may have to walk right * through leaf pages to find the one leaf page that we must insert on * to. * * This is needed for checkingunique callers because a scantid was not * used when we called _bt_search(). scantid can only be set after * _bt_check_unique() has checked for duplicates. The buffer * initially stored in insertstate->buf has the page where the first * duplicate key might be found, which isn't always the page that new * tuple belongs on. The heap TID attribute for new tuple (scantid) * could force us to insert on a sibling page, though that should be * very rare in practice. */ if (checkingunique) { if (insertstate->low < insertstate->stricthigh) { /* Encountered a duplicate in _bt_check_unique() */ Assert(insertstate->bounds_valid); uniquedup = true; } for (;;) { /* * Does the new tuple belong on this page? * * The earlier _bt_check_unique() call may well have * established a strict upper bound on the offset for the new * item. If it's not the last item of the page (i.e. if there * is at least one tuple on the page that goes after the tuple * we're inserting) then we know that the tuple belongs on * this page. We can skip the high key check. */ if (insertstate->bounds_valid && insertstate->low <= insertstate->stricthigh && insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) break; /* Test '<=', not '!=', since scantid is set now */ if (P_RIGHTMOST(opaque) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) break; _bt_stepright(rel, heapRel, insertstate, stack); /* Update local state after stepping right */ page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); /* Assume duplicates (if checkingunique) */ uniquedup = true; } } /* * If the target page cannot fit newitem, try to avoid splitting the * page on insert by performing deletion or deduplication now */ if (PageGetFreeSpace(page) < insertstate->itemsz) _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, checkingunique, uniquedup, indexUnchanged); } else { /*---------- * This is a !heapkeyspace (version 2 or 3) index. The current page * is the first page that we could insert the new tuple to, but there * may be other pages to the right that we could opt to use instead. 
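 *
 * A tiny worked example (hypothetical keys and TIDs) of why the insert
 * point may not be on the first page examined: with scantid initially
 * omitted, a run of duplicates can span a page boundary,
 *
 *		page 1: ... (42,'(0,1)') (42,'(0,2)')  | high key 42
 *		page 2: (42,'(0,7)') (42,'(0,9)') ...
 *
 * and once scantid is restored (say the new tuple's TID is '(0,8)'), the
 * correct slot is on page 2, so _bt_findinsertloc() must step right past
 * page 1 even though the initial search and uniqueness check landed there.
 *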
* * If the new key is equal to one or more existing keys, we can * legitimately place it anywhere in the series of equal keys. In * fact, if the new key is equal to the page's "high key" we can place * it on the next page. If it is equal to the high key, and there's * not room to insert the new tuple on the current page without * splitting, then we move right hoping to find more free space and * avoid a split. * * Keep scanning right until we * (a) find a page with enough free space, * (b) reach the last page where the tuple can legally go, or * (c) get tired of searching. * (c) is not flippant; it is important because if there are many * pages' worth of equal keys, it's better to split one of the early * pages than to scan all the way to the end of the run of equal keys * on every insert. We implement "get tired" as a random choice, * since stopping after scanning a fixed number of pages wouldn't work * well (we'd never reach the right-hand side of previously split * pages). The probability of moving right is set at 0.99, which may * seem too high to change the behavior much, but it does an excellent * job of preventing O(N^2) behavior with many equal keys. *---------- */ while (PageGetFreeSpace(page) < insertstate->itemsz) { /* * Before considering moving right, see if we can obtain enough * space by erasing LP_DEAD items */ if (P_HAS_GARBAGE(opaque)) { /* Perform simple deletion */ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, false, false, false); if (PageGetFreeSpace(page) >= insertstate->itemsz) break; /* OK, now we have enough space */ } /* * Nope, so check conditions (b) and (c) enumerated above * * The earlier _bt_check_unique() call may well have established a * strict upper bound on the offset for the new item. If it's not * the last item of the page (i.e. if there is at least one tuple * on the page that's greater than the tuple we're inserting to) * then we know that the tuple belongs on this page. We can skip * the high key check. */ if (insertstate->bounds_valid && insertstate->low <= insertstate->stricthigh && insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) break; if (P_RIGHTMOST(opaque) || _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) break; _bt_stepright(rel, heapRel, insertstate, stack); /* Update local state after stepping right */ page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); } } /* * We should now be on the correct page. Find the offset within the page * for the new tuple. (Possibly reusing earlier search bounds.) */ Assert(P_RIGHTMOST(opaque) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); newitemoff = _bt_binsrch_insert(rel, insertstate); if (insertstate->postingoff == -1) { /* * There is an overlapping posting list tuple with its LP_DEAD bit * set. We don't want to unnecessarily unset its LP_DEAD bit while * performing a posting list split, so perform simple index tuple * deletion early. */ _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, false, false, false); /* * Do new binary search. New insert location cannot overlap with any * posting list now. */ Assert(!insertstate->bounds_valid); insertstate->postingoff = 0; newitemoff = _bt_binsrch_insert(rel, insertstate); Assert(insertstate->postingoff == 0); } return newitemoff; } /* * Step right to next non-dead page, during insertion. * * This is a bit more complicated than moving right in a search. 
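 *
 * (Referring back to the "get tired" choice described above: a minimal,
 * self-contained sketch of a stop-with-probability-1/100 walk, using
 * stdlib rand() purely for illustration where the real code uses
 * pg_prng_uint32():
 *
 *		#include <stdbool.h>
 *		#include <stdlib.h>
 *
 *		// True about once per hundred calls: give up walking right and
 *		// accept a split of the page we are on instead.
 *		static bool
 *		get_tired(void)
 *		{
 *			return rand() % 100 == 0;
 *		}
 *
 * With a 1-in-100 chance of stopping at each page, the expected number of
 * equal-key pages visited before giving up is about 100, no matter how
 * long the run of duplicates is.)
 *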
We must * write-lock the target page before releasing write lock on current page; * else someone else's _bt_check_unique scan could fail to see our insertion. * Write locks on intermediate dead pages won't do because we don't know when * they will get de-linked from the tree. * * This is more aggressive than it needs to be for non-unique !heapkeyspace * indexes. */ static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack) { Page page; BTPageOpaque opaque; Buffer rbuf; BlockNumber rblkno; Assert(heaprel != NULL); page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); rbuf = InvalidBuffer; rblkno = opaque->btpo_next; for (;;) { rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); page = BufferGetPage(rbuf); opaque = BTPageGetOpaque(page); /* * If this page was incompletely split, finish the split now. We do * this while holding a lock on the left sibling, which is not good * because finishing the split could be a fairly lengthy operation. * But this should happen very seldom. */ if (P_INCOMPLETE_SPLIT(opaque)) { _bt_finish_split(rel, heaprel, rbuf, stack); rbuf = InvalidBuffer; continue; } if (!P_IGNORE(opaque)) break; if (P_RIGHTMOST(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); rblkno = opaque->btpo_next; } /* rbuf locked; unlock buf, update state for caller */ _bt_relbuf(rel, insertstate->buf); insertstate->buf = rbuf; insertstate->bounds_valid = false; } /*---------- * _bt_insertonpg() -- Insert a tuple on a particular page in the index. * * This recursive procedure does the following things: * * + if postingoff != 0, splits existing posting list tuple * (since it overlaps with new 'itup' tuple). * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). * + inserts the new tuple (might be split from posting list). * + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). * + invokes itself with the appropriate tuple for the right * child page on the parent. * + updates the metapage if a true root or fast root is split. * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, * we will have dropped both the pin and the lock on the buffer. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high * key items are created indirectly, when a page is split). When * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page * we're inserting the downlink for. This function will clear the * INCOMPLETE_SPLIT flag on it, and release the buffer. 
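 *
 * A hedged sketch of the split-propagation recursion outlined above
 * (illustrative pseudo-C; page_has_room(), add_item(), split(),
 * make_downlink(), high_key() and parent_of() are made-up names for work
 * actually shared between this function, _bt_split() and
 * _bt_insert_parent()):
 *
 *		void
 *		insert_on_page(Page page, IndexTuple item, BTStack stack)
 *		{
 *			if (page_has_room(page, item))
 *				add_item(page, item);					// common, non-split case
 *			else
 *			{
 *				Page		rpage = split(page, item);	// item added while splitting
 *				IndexTuple	downlink = make_downlink(high_key(page), rpage);
 *
 *				insert_on_page(parent_of(stack, page), downlink, stack);
 *			}
 *		}
 *
 * A split of the true root has no parent to recurse into; that case goes
 * through _bt_newlevel() instead, which adds a level above the two halves.
 *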
*---------- */ static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page) { Page page; BTPageOpaque opaque; bool isleaf, isroot, isrightmost, isonly; IndexTuple oposting = NULL; IndexTuple origitup = NULL; IndexTuple nposting = NULL; page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); isleaf = P_ISLEAF(opaque); isroot = P_ISROOT(opaque); isrightmost = P_RIGHTMOST(opaque); isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque); /* child buffer must be given iff inserting on an internal page */ Assert(isleaf == !BufferIsValid(cbuf)); /* tuple must have appropriate number of attributes */ Assert(!isleaf || BTreeTupleGetNAtts(itup, rel) == IndexRelationGetNumberOfAttributes(rel)); Assert(isleaf || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); Assert(!BTreeTupleIsPosting(itup)); Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz); /* Caller must always finish incomplete split for us */ Assert(!P_INCOMPLETE_SPLIT(opaque)); /* * Every internal page should have exactly one negative infinity item at * all times. Only _bt_split() and _bt_newlevel() should add items that * become negative infinity items through truncation, since they're the * only routines that allocate new internal pages. */ Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque)); /* * Do we need to split an existing posting list item? */ if (postingoff != 0) { ItemId itemid = PageGetItemId(page, newitemoff); /* * The new tuple is a duplicate with a heap TID that falls inside the * range of an existing posting list tuple on a leaf page. Prepare to * split an existing posting list. Overwriting the posting list with * its post-split version is treated as an extra step in either the * insert or page split critical section. */ Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage); oposting = (IndexTuple) PageGetItem(page, itemid); /* * postingoff value comes from earlier call to _bt_binsrch_posting(). * Its binary search might think that a plain tuple must be a posting * list tuple that needs to be split. This can happen with corruption * involving an existing plain tuple that is a duplicate of the new * item, up to and including its table TID. Check for that here in * passing. * * Also verify that our caller has made sure that the existing posting * list tuple does not have its LP_DEAD bit set. */ if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"", ItemPointerGetBlockNumber(&itup->t_tid), ItemPointerGetOffsetNumber(&itup->t_tid), newitemoff, BufferGetBlockNumber(buf), RelationGetRelationName(rel)))); /* use a mutable copy of itup as our itup from here on */ origitup = itup; itup = CopyIndexTuple(origitup); nposting = _bt_swap_posting(itup, oposting, postingoff); /* itup now contains rightmost/max TID from oposting */ /* Alter offset so that newitem goes after posting list */ newitemoff = OffsetNumberNext(newitemoff); } /* * Do we need to split the page to fit the item on it? * * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, * so this comparison is correct even though we appear to be accounting * only for the item and not for its line pointer. 
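 *
 * Illustrative arithmetic with hypothetical numbers: if pd_upper - pd_lower
 * is 48 bytes, PageGetFreeSpace() reports 48 - sizeof(ItemIdData) = 44, so
 * a MAXALIGN'd tuple with itemsz 40 passes the test below even though its
 * true cost is 40 bytes of tuple plus a 4 byte line pointer = 44 bytes:
 *
 *		// what the test below effectively computes
 *		fits = ((pd_upper - pd_lower) - sizeof(ItemIdData)) >= itemsz;
 *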
*/ if (PageGetFreeSpace(page) < itemsz) { Buffer rbuf; Assert(!split_only_page); /* split the buffer into left and right halves */ rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz, itup, origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); /*---------- * By here, * * + our target page has been split; * + the original tuple has been inserted; * + we have write locks on both the old (left half) * and new (right half) buffers, after the split; and * + we know the key we want to insert into the parent * (it's the "high key" on the left child page). * * We're ready to do the parent insertion. We need to hold onto the * locks for the child pages until we locate the parent, but we can * at least release the lock on the right child before doing the * actual insertion. The lock on the left child will be released * last of all by parent insertion, where it is the 'cbuf' of parent * page. *---------- */ _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly); } else { Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; BlockNumber blockcache; /* * If we are doing this insert because we split a page that was the * only one on its tree level, but was not the root, it may have been * the "fast root". We need to ensure that the fast root link points * at or above the current page. We can safely acquire a lock on the * metapage here --- see comments for _bt_newlevel(). */ if (unlikely(split_only_page)) { Assert(!isleaf); Assert(BufferIsValid(cbuf)); metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); if (metad->btm_fastlevel >= opaque->btpo_level) { /* no update wanted */ _bt_relbuf(rel, metabuf); metabuf = InvalidBuffer; } } /* Do the update. No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); if (postingoff != 0) memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item to block %u in index \"%s\"", BufferGetBlockNumber(buf), RelationGetRelationName(rel)); MarkBufferDirty(buf); if (BufferIsValid(metabuf)) { /* upgrade meta-page if needed */ if (metad->btm_version < BTREE_NOVAC_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = BufferGetBlockNumber(buf); metad->btm_fastlevel = opaque->btpo_level; MarkBufferDirty(metabuf); } /* * Clear INCOMPLETE_SPLIT flag on child if inserting the new item * finishes a split */ if (!isleaf) { Page cpage = BufferGetPage(cbuf); BTPageOpaque cpageop = BTPageGetOpaque(cpage); Assert(P_INCOMPLETE_SPLIT(cpageop)); cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; MarkBufferDirty(cbuf); } /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; uint16 upostingoff; xlrec.offnum = newitemoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); if (isleaf && postingoff == 0) { /* Simple leaf insert */ xlinfo = XLOG_BTREE_INSERT_LEAF; } else if (postingoff != 0) { /* * Leaf insert with posting list split. Must include * postingoff field before newitem/orignewitem. 
*/ Assert(isleaf); xlinfo = XLOG_BTREE_INSERT_POST; } else { /* Internal page insert, which finishes a split on cbuf */ xlinfo = XLOG_BTREE_INSERT_UPPER; XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); if (BufferIsValid(metabuf)) { /* Actually, it's an internal page insert + meta update */ xlinfo = XLOG_BTREE_INSERT_META; Assert(metad->btm_version >= BTREE_NOVAC_VERSION); xlmeta.version = metad->btm_version; xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; xlmeta.allequalimage = metad->btm_allequalimage; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); } } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); if (postingoff == 0) { /* Just log itup from caller */ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); } else { /* * Insert with posting list split (XLOG_BTREE_INSERT_POST * record) case. * * Log postingoff. Also log origitup, not itup. REDO routine * must reconstruct final itup (as well as nposting) using * _bt_swap_posting(). */ upostingoff = postingoff; XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); XLogRegisterBufData(0, (char *) origitup, IndexTupleSize(origitup)); } recptr = XLogInsert(RM_BTREE_ID, xlinfo); if (BufferIsValid(metabuf)) PageSetLSN(metapg, recptr); if (!isleaf) PageSetLSN(BufferGetPage(cbuf), recptr); PageSetLSN(page, recptr); } END_CRIT_SECTION(); /* Release subsidiary buffers */ if (BufferIsValid(metabuf)) _bt_relbuf(rel, metabuf); if (!isleaf) _bt_relbuf(rel, cbuf); /* * Cache the block number if this is the rightmost leaf page. Cache * may be used by a future inserter within _bt_search_insert(). */ blockcache = InvalidBlockNumber; if (isrightmost && isleaf && !isroot) blockcache = BufferGetBlockNumber(buf); /* Release buffer for insertion target block */ _bt_relbuf(rel, buf); /* * If we decided to cache the insertion target block before releasing * its buffer lock, then cache it now. Check the height of the tree * first, though. We don't go for the optimization with small * indexes. Defer final check to this point to ensure that we don't * call _bt_getrootheight while holding a buffer lock. */ if (BlockNumberIsValid(blockcache) && _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, blockcache); } /* be tidy */ if (postingoff != 0) { /* itup is actually a modified copy of caller's original */ pfree(nposting); pfree(itup); } } /* * _bt_split() -- split a page in the btree. * * On entry, buf is the page to split, and is pinned and write-locked. * newitemoff etc. tell us about the new item that must be inserted * along with the data from the original page. * * itup_key is used for suffix truncation on leaf pages (internal * page callers pass NULL). When splitting a non-leaf page, 'cbuf' * is the left-sibling of the page we're inserting the downlink for. * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * * orignewitem, nposting, and postingoff are needed when an insert of * orignewitem results in both a posting list split and a page split. * These extra posting list split details are used here in the same * way as they are used in the more common case where a posting list * split does not coincide with a page split. 
We need to deal with * posting list splits directly in order to ensure that everything * that follows from the insert of orignewitem is handled as a single * atomic operation (though caller's insert of a new pivot/downlink * into parent page will still be a separate operation). See * nbtree/README for details on the design of posting list splits. * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) { Buffer rbuf; Page origpage; Page leftpage, rightpage; BlockNumber origpagenumber, rightpagenumber; BTPageOpaque ropaque, lopaque, oopaque; Buffer sbuf = InvalidBuffer; Page spage = NULL; BTPageOpaque sopaque = NULL; Size itemsz; ItemId itemid; IndexTuple firstright, lefthighkey; OffsetNumber firstrightoff; OffsetNumber afterleftoff, afterrightoff, minusinfoff; OffsetNumber origpagepostingoff; OffsetNumber maxoff; OffsetNumber i; bool newitemonleft, isleaf, isrightmost; /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back * into origpage on success. rightpage is the new page that will receive * the right-sibling data. * * leftpage is allocated after choosing a split point. rightpage's new * buffer isn't acquired until after leftpage is initialized and has new * high key, the last point where splitting the page may fail (barring * corruption). Failing before acquiring new buffer won't have lasting * consequences, since origpage won't have been modified and leftpage is * only workspace. */ origpage = BufferGetPage(buf); oopaque = BTPageGetOpaque(origpage); isleaf = P_ISLEAF(oopaque); isrightmost = P_RIGHTMOST(oopaque); maxoff = PageGetMaxOffsetNumber(origpage); origpagenumber = BufferGetBlockNumber(buf); /* * Choose a point to split origpage at. * * A split point can be thought of as a point _between_ two existing data * items on origpage (the lastleft and firstright tuples), provided you * pretend that the new item that didn't fit is already on origpage. * * Since origpage does not actually contain newitem, the representation of * split points needs to work with two boundary cases: splits where * newitem is lastleft, and splits where newitem is firstright. * newitemonleft resolves the ambiguity that would otherwise exist when * newitemoff == firstrightoff. In all other cases it's clear which side * of the split every tuple goes on from context. newitemonleft is * usually (but not always) redundant information. * * firstrightoff is supposed to be an origpage offset number, but it's * possible that its value will be maxoff+1, which is "past the end" of * origpage. This happens in the rare case where newitem goes after all * existing items (i.e. newitemoff is maxoff+1) and we end up splitting * origpage at the point that leaves newitem alone on new right page. Any * "!newitemonleft && newitemoff == firstrightoff" split point makes * newitem the firstright tuple, though, so this case isn't a special * case. */ firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); /* Allocate temp buffer for leftpage */ leftpage = PageGetTempPage(origpage); _bt_pageinit(leftpage, BufferGetPageSize(buf)); lopaque = BTPageGetOpaque(leftpage); /* * leftpage won't be the root when we're done. 
Also, clear the SPLIT_END * and HAS_GARBAGE flags. */ lopaque->btpo_flags = oopaque->btpo_flags; lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); /* set flag in leftpage indicating that rightpage has no downlink yet */ lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; /* handle btpo_next after rightpage buffer acquired */ lopaque->btpo_level = oopaque->btpo_level; /* handle btpo_cycleid after rightpage buffer acquired */ /* * Copy the original page's LSN into leftpage, which will become the * updated version of the page. We need this because XLogInsert will * examine the LSN and possibly dump it in a page image. */ PageSetLSN(leftpage, PageGetLSN(origpage)); /* * Determine page offset number of existing overlapped-with-orignewitem * posting list when it is necessary to perform a posting list split in * passing. Note that newitem was already changed by caller (newitem no * longer has the orignewitem TID). * * This page offset number (origpagepostingoff) will be used to pretend * that the posting split has already taken place, even though the * required modifications to origpage won't occur until we reach the * critical section. The lastleft and firstright tuples of our page split * point should, in effect, come from an imaginary version of origpage * that has the nposting tuple instead of the original posting list tuple. * * Note: _bt_findsplitloc() should have compensated for coinciding posting * list splits in just the same way, at least in theory. It doesn't * bother with that, though. In practice it won't affect its choice of * split point. */ origpagepostingoff = InvalidOffsetNumber; if (postingoff != 0) { Assert(isleaf); Assert(ItemPointerCompare(&orignewitem->t_tid, &newitem->t_tid) < 0); Assert(BTreeTupleIsPosting(nposting)); origpagepostingoff = OffsetNumberPrev(newitemoff); } /* * The high key for the new left page is a possibly-truncated copy of * firstright on the leaf level (it's "firstright itself" on internal * pages; see !isleaf comments below). This may seem to be contrary to * Lehman & Yao's approach of using a copy of lastleft as the new high key * when splitting on the leaf level. It isn't, though. * * Suffix truncation will leave the left page's high key fully equal to * lastleft when lastleft and firstright are equal prior to heap TID (that * is, the tiebreaker TID value comes from lastleft). It isn't actually * necessary for a new leaf high key to be a copy of lastleft for the L&Y * "subtree" invariant to hold. It's sufficient to make sure that the new * leaf high key is strictly less than firstright, and greater than or * equal to (not necessarily equal to) lastleft. In other words, when * suffix truncation isn't possible during a leaf page split, we take * L&Y's exact approach to generating a new high key for the left page. * (Actually, that is slightly inaccurate. We don't just use a copy of * lastleft. A tuple with all the keys from firstright but the max heap * TID from lastleft is used, to avoid introducing a special case.) 
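 *
 * A character-level analogy of what suffix truncation buys us (hedged:
 * nbtree truncates whole key attributes plus the heap TID, never bytes
 * within an attribute; separator() is made up, strcmp() needs <string.h>,
 * and buf must be at least strlen(firstright) + 1 bytes):
 *
 *		// Shortest prefix of firstright that still sorts >= lastleft;
 *		// e.g. separator("Jackson", "Johnson", buf) yields "Jo", which
 *		// satisfies lastleft <= highkey < firstright.
 *		static const char *
 *		separator(const char *lastleft, const char *firstright, char *buf)
 *		{
 *			size_t		len = 0;
 *
 *			do
 *			{
 *				buf[len] = firstright[len];
 *				buf[++len] = '\0';
 *			} while (strcmp(buf, lastleft) < 0);
 *
 *			return buf;
 *		}
 *
 * The shorter the separator, the more tuples fit on internal pages, which
 * is the payoff _bt_truncate() is after on the leaf level.
 *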
*/ if (!newitemonleft && newitemoff == firstrightoff) { /* incoming tuple becomes firstright */ itemsz = newitemsz; firstright = newitem; } else { /* existing item at firstrightoff becomes firstright */ itemid = PageGetItemId(origpage, firstrightoff); itemsz = ItemIdGetLength(itemid); firstright = (IndexTuple) PageGetItem(origpage, itemid); if (firstrightoff == origpagepostingoff) firstright = nposting; } if (isleaf) { IndexTuple lastleft; /* Attempt suffix truncation for leaf page splits */ if (newitemonleft && newitemoff == firstrightoff) { /* incoming tuple becomes lastleft */ lastleft = newitem; } else { OffsetNumber lastleftoff; /* existing item before firstrightoff becomes lastleft */ lastleftoff = OffsetNumberPrev(firstrightoff); Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); if (lastleftoff == origpagepostingoff) lastleft = nposting; } lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key); itemsz = IndexTupleSize(lefthighkey); } else { /* * Don't perform suffix truncation on a copy of firstright to make * left page high key for internal page splits. Must use firstright * as new high key directly. * * Each distinct separator key value originates as a leaf level high * key; all other separator keys/pivot tuples are copied from one * level down. A separator key in a grandparent page must be * identical to high key in rightmost parent page of the subtree to * its left, which must itself be identical to high key in rightmost * child page of that same subtree (this even applies to separator * from grandparent's high key). There must always be an unbroken * "seam" of identical separator keys that guide index scans at every * level, starting from the grandparent. That's why suffix truncation * is unsafe here. * * Internal page splits will truncate firstright into a "negative * infinity" data item when it gets inserted on the new right page * below, though. This happens during the call to _bt_pgaddtup() for * the new first data item for right page. Do not confuse this * mechanism with suffix truncation. It is just a convenient way of * implementing page splits that split the internal page "inside" * firstright. The lefthighkey separator key cannot appear a second * time in the right page (only firstright's downlink goes in right * page). */ lefthighkey = firstright; } /* * Add new high key to leftpage */ afterleftoff = P_HIKEY; Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add high key to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); afterleftoff = OffsetNumberNext(afterleftoff); /* * Acquire a new right page to split into, now that left page has a new * high key. From here on, it's not okay to throw an error without * zeroing rightpage first. This coding rule ensures that we won't * confuse future VACUUM operations, which might otherwise try to re-find * a downlink to a leftover junk page as the page undergoes deletion. * * It would be reasonable to start the critical section just after the new * rightpage buffer is acquired instead; that would allow us to avoid * leftover junk pages without bothering to zero rightpage. 
We do it this * way because it avoids an unnecessary PANIC when either origpage or its * existing sibling page are corrupt. */ rbuf = _bt_allocbuf(rel, heaprel); rightpage = BufferGetPage(rbuf); rightpagenumber = BufferGetBlockNumber(rbuf); /* rightpage was initialized by _bt_getbuf */ ropaque = BTPageGetOpaque(rightpage); /* * Finish off remaining leftpage special area fields. They cannot be set * before both origpage (leftpage) and rightpage buffers are acquired and * locked. * * btpo_cycleid is only used with leaf pages, though we set it here in all * cases just to be consistent. */ lopaque->btpo_next = rightpagenumber; lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); /* * rightpage won't be the root when we're done. Also, clear the SPLIT_END * and HAS_GARBAGE flags. */ ropaque->btpo_flags = oopaque->btpo_flags; ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; ropaque->btpo_level = oopaque->btpo_level; ropaque->btpo_cycleid = lopaque->btpo_cycleid; /* * Add new high key to rightpage where necessary. * * If the page we're splitting is not the rightmost page at its level in * the tree, then the first entry on the page is the high key from * origpage. */ afterrightoff = P_HIKEY; if (!isrightmost) { IndexTuple righthighkey; itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); righthighkey = (IndexTuple) PageGetItem(origpage, itemid); Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); Assert(BTreeTupleGetNAtts(righthighkey, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, false, false) == InvalidOffsetNumber) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add high key to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterrightoff = OffsetNumberNext(afterrightoff); } /* * Internal page splits truncate first data item on right page -- it * becomes "minus infinity" item for the page. Set this up here. */ minusinfoff = InvalidOffsetNumber; if (!isleaf) minusinfoff = afterrightoff; /* * Now transfer all the data items (non-pivot tuples in isleaf case, or * additional pivot tuples in !isleaf case) to the appropriate page. * * Note: we *must* insert at least the right page's items in item-number * order, for the benefit of _bt_restore_page(). */ for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) { IndexTuple dataitem; itemid = PageGetItemId(origpage, i); itemsz = ItemIdGetLength(itemid); dataitem = (IndexTuple) PageGetItem(origpage, itemid); /* replace original item with nposting due to posting split? */ if (i == origpagepostingoff) { Assert(BTreeTupleIsPosting(dataitem)); Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); dataitem = nposting; } /* does new item belong before this one? 
*/ else if (i == newitemoff) { if (newitemonleft) { Assert(newitemoff <= firstrightoff); if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, false)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterleftoff = OffsetNumberNext(afterleftoff); } else { Assert(newitemoff >= firstrightoff); if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterrightoff = OffsetNumberNext(afterrightoff); } } /* decide which page to put it on */ if (i < firstrightoff) { if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterleftoff = OffsetNumberNext(afterleftoff); } else { if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add old item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterrightoff = OffsetNumberNext(afterrightoff); } } /* Handle case where newitem goes at the end of rightpage */ if (i <= newitemoff) { /* * Can't have newitemonleft here; that would imply we were told to put * *everything* on the left page, which cannot fit (if it could, we'd * not be splitting the page). */ Assert(!newitemonleft && newitemoff == maxoff + 1); if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, afterrightoff == minusinfoff)) { memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add new item to the right sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); } afterrightoff = OffsetNumberNext(afterrightoff); } /* * We have to grab the original right sibling (if any) and update its prev * link. We are guaranteed that this is deadlock-free, since we couple * the locks in the standard order: left to right. */ if (!isrightmost) { sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); spage = BufferGetPage(sbuf); sopaque = BTPageGetOpaque(spage); if (sopaque->btpo_prev != origpagenumber) { memset(rightpage, 0, BufferGetPageSize(rbuf)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, RelationGetRelationName(rel)))); } /* * Check to see if we can set the SPLIT_END flag in the right-hand * split page; this can save some I/O for vacuum since it need not * proceed to the right sibling. We can set the flag if the right * sibling has a different cycleid: that means it could not be part of * a group of pages that were all split off from the same ancestor * page. If you're confused, imagine that page A splits to A B and * then again, yielding A C B, while vacuum is in progress. Tuples * originally in A could now be in either B or C, hence vacuum must * examine both pages. 
But if D, our right sibling, has a different * cycleid then it could not contain any tuples that were in A when * the vacuum started. */ if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) ropaque->btpo_flags |= BTP_SPLIT_END; } /* * Right sibling is locked, new siblings are prepared, but original page * is not updated yet. * * NO EREPORT(ERROR) till right sibling is updated. We can get away with * not starting the critical section till here because we haven't been * scribbling on the original page yet; see comments above. */ START_CRIT_SECTION(); /* * By here, the original data page has been split into two new halves, and * these are correct. The algorithm requires that the left page never * move during a split, so we copy the new left page back on top of the * original. We need to do this before writing the WAL record, so that * XLogInsert can WAL log an image of the page if necessary. */ PageRestoreTempPage(leftpage, origpage); /* leftpage, lopaque must not be used below here */ MarkBufferDirty(buf); MarkBufferDirty(rbuf); if (!isrightmost) { sopaque->btpo_prev = rightpagenumber; MarkBufferDirty(sbuf); } /* * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes * a split */ if (!isleaf) { Page cpage = BufferGetPage(cbuf); BTPageOpaque cpageop = BTPageGetOpaque(cpage); cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; MarkBufferDirty(cbuf); } /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstrightoff = firstrightoff; xlrec.newitemoff = newitemoff; xlrec.postingoff = 0; if (postingoff != 0 && origpagepostingoff < firstrightoff) xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); /* Log original right sibling, since we've changed its prev-pointer */ if (!isrightmost) XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); if (!isleaf) XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); /* * Log the new item, if it was inserted on the left page. (If it was * put on the right page, we don't need to explicitly WAL log it * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of * the left page. We always store newitemoff in the record, though. * * The details are sometimes slightly different for page splits that * coincide with a posting list split. If both the replacement * posting list and newitem go on the right page, then we don't need * to log anything extra, just like the simple !newitemonleft * no-posting-split case (postingoff is set to zero in the WAL record, * so recovery doesn't need to process a posting list split at all). * Otherwise, we set postingoff and log orignewitem instead of * newitem, despite having actually inserted newitem. REDO routine * must reconstruct nposting and newitem using _bt_swap_posting(). * * Note: It's possible that our page split point is the point that * makes the posting list lastleft and newitem firstright. This is * the only case where we log orignewitem/newitem despite newitem * going on the right page. 
If XLogInsert decides that it can omit * orignewitem due to logging a full-page image of the left page, * everything still works out, since recovery only needs to log * orignewitem for items on the left page (just like the regular * newitem-logged case). */ if (newitemonleft && xlrec.postingoff == 0) XLogRegisterBufData(0, (char *) newitem, newitemsz); else if (xlrec.postingoff != 0) { Assert(isleaf); Assert(newitemonleft || firstrightoff == newitemoff); Assert(newitemsz == IndexTupleSize(orignewitem)); XLogRegisterBufData(0, (char *) orignewitem, newitemsz); } /* Log the left page's new high key */ if (!isleaf) { /* lefthighkey isn't local copy, get current pointer */ itemid = PageGetItemId(origpage, P_HIKEY); lefthighkey = (IndexTuple) PageGetItem(origpage, itemid); } XLogRegisterBufData(0, (char *) lefthighkey, MAXALIGN(IndexTupleSize(lefthighkey))); /* * Log the contents of the right page in the format understood by * _bt_restore_page(). The whole right page will be recreated. * * Direct access to page is not good but faster - we should implement * some new func in page API. Note we only store the tuples * themselves, knowing that they were inserted in item-number order * and so the line pointers can be reconstructed. See comments for * _bt_restore_page(). */ XLogRegisterBufData(1, (char *) rightpage + ((PageHeader) rightpage)->pd_upper, ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; recptr = XLogInsert(RM_BTREE_ID, xlinfo); PageSetLSN(origpage, recptr); PageSetLSN(rightpage, recptr); if (!isrightmost) PageSetLSN(spage, recptr); if (!isleaf) PageSetLSN(BufferGetPage(cbuf), recptr); } END_CRIT_SECTION(); /* release the old right sibling */ if (!isrightmost) _bt_relbuf(rel, sbuf); /* release the child */ if (!isleaf) _bt_relbuf(rel, cbuf); /* be tidy */ if (isleaf) pfree(lefthighkey); /* split's done */ return rbuf; } /* * _bt_insert_parent() -- Insert downlink into parent, completing split. * * On entry, buf and rbuf are the left and right split pages, which we * still hold write locks on. Both locks will be released here. We * release the rbuf lock once we have a write lock on the page that we * intend to insert a downlink to rbuf on (i.e. buf's current parent page). * The lock on buf is released at the same point as the lock on the parent * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same * atomic operation that completes the split by inserting a new downlink. * * stack - stack showing how we got here. Will be NULL when splitting true * root, or during concurrent root split, where we can be inefficient * isroot - we split the true root * isonly - we split a page alone on its level (might have been fast root) */ static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly) { Assert(heaprel != NULL); /* * Here we have to do something Lehman and Yao don't talk about: deal with * a root split and construction of a new root. If our stack is empty * then we have just split a node on what had been the root level when we * descended the tree. If it was still the root then we perform a * new-root construction. If it *wasn't* the root anymore, search to find * the next higher level that someone constructed meanwhile, and find the * right place to insert as for the normal case. * * If we have to search for the parent level, we do so by re-descending * from the root. 
This is not super-efficient, but it's rare enough not * to matter. */ if (isroot) { Buffer rootbuf; Assert(stack == NULL); Assert(isonly); /* create a new root node one level up and update the metapage */ rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf); /* release the split buffers */ _bt_relbuf(rel, rootbuf); _bt_relbuf(rel, rbuf); _bt_relbuf(rel, buf); } else { BlockNumber bknum = BufferGetBlockNumber(buf); BlockNumber rbknum = BufferGetBlockNumber(rbuf); Page page = BufferGetPage(buf); IndexTuple new_item; BTStackData fakestack; IndexTuple ritem; Buffer pbuf; if (stack == NULL) { BTPageOpaque opaque; elog(DEBUG2, "concurrent ROOT page split"); opaque = BTPageGetOpaque(page); /* * We should never reach here when a leaf page split takes place * despite the insert of newitem being able to apply the fastpath * optimization. Make sure of that with an assertion. * * This is more of a performance issue than a correctness issue. * The fastpath won't have a descent stack. Using a phony stack * here works, but never rely on that. The fastpath should be * rejected within _bt_search_insert() when the rightmost leaf * page will split, since it's faster to go through _bt_search() * and get a stack in the usual way. */ Assert(!(P_ISLEAF(opaque) && BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } /* get high key from left, a strict lower bound for new right page */ ritem = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); /* form an index tuple that points at the new right page */ new_item = CopyIndexTuple(ritem); BTreeTupleSetDownLink(new_item, rbknum); /* * Re-find and write lock the parent of buf. * * It's possible that the location of buf's downlink has changed since * our initial _bt_search() descent. _bt_getstackbuf() will detect * and recover from this, updating the stack, which ensures that the * new downlink will be inserted at the correct offset. Even buf's * parent may have changed. */ pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum); /* * Unlock the right child. The left child will be unlocked in * _bt_insertonpg(). * * Unlocking the right child must be delayed until here to ensure that * no concurrent VACUUM operation can become confused. Page deletion * cannot be allowed to fail to re-find a downlink for the rbuf page. * (Actually, this is just a vestige of how things used to work. The * page deletion code is expected to check for the INCOMPLETE_SPLIT * flag on the left child. It won't attempt deletion of the right * child until the split is complete. Despite all this, we opt to * conservatively delay unlocking the right child until here.) */ _bt_relbuf(rel, rbuf); if (pbuf == InvalidBuffer) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u", RelationGetRelationName(rel), bknum, rbknum))); /* Recursively insert into the parent */ _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent, new_item, MAXALIGN(IndexTupleSize(new_item)), stack->bts_offset + 1, 0, isonly); /* be tidy */ pfree(new_item); } } /* * _bt_finish_split() -- Finish an incomplete split * * A crash or other failure can leave a split incomplete. 
The insertion * routines won't allow to insert on a page that is incompletely split. * Before inserting on such a page, call _bt_finish_split(). * * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked * and unpinned. * * Caller must provide a valid heaprel, since finishing a page split requires * allocating a new page if and when the parent page splits in turn. */ void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) { Page lpage = BufferGetPage(lbuf); BTPageOpaque lpageop = BTPageGetOpaque(lpage); Buffer rbuf; Page rpage; BTPageOpaque rpageop; bool wasroot; bool wasonly; Assert(P_INCOMPLETE_SPLIT(lpageop)); Assert(heaprel != NULL); /* Lock right sibling, the one missing the downlink */ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); rpage = BufferGetPage(rbuf); rpageop = BTPageGetOpaque(rpage); /* Could this be a root split? */ if (!stack) { Buffer metabuf; Page metapg; BTMetaPageData *metad; /* acquire lock on the metapage */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf)); _bt_relbuf(rel, metabuf); } else wasroot = false; /* Was this the only page on the level before split? */ wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); elog(DEBUG1, "finishing incomplete split of %u/%u", BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly); } /* * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot * tuple whose downlink points to child page. * * Caller passes child's block number, which is used to identify * associated pivot tuple in parent page using a linear search that * matches on pivot's downlink/block number. The expected location of * the pivot tuple is taken from the stack one level above the child * page. This is used as a starting point. Insertions into the * parent level could cause the pivot tuple to move right; deletions * could cause it to move left, but not left of the page we previously * found it on. * * Caller can use its stack to relocate the pivot tuple/downlink for * any same-level page to the right of the page found by its initial * descent. This is necessary because of the possibility that caller * moved right to recover from a concurrent page split. It's also * convenient for certain callers to be able to step right when there * wasn't a concurrent page split, while still using their original * stack. For example, the checkingunique _bt_doinsert() case may * have to step right when there are many physical duplicates, and its * scantid forces an insertion to the right of the "first page the * value could be on". (This is also relied on by all of our callers * when dealing with !heapkeyspace indexes.) * * Returns write-locked parent page buffer, or InvalidBuffer if pivot * tuple not found (should not happen). Adjusts bts_blkno & * bts_offset if changed. Page split caller should insert its new * pivot tuple for its new right sibling page on parent page, at the * offset number bts_offset + 1. 
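 *
 * As a condensed illustration (a paraphrase of what _bt_insert_parent()
 * does above, not an additional requirement), a page split caller uses
 * the result roughly like this:
 *
 *		pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
 *		if (pbuf == InvalidBuffer)
 *			ereport(ERROR, ...);
 *		_bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
 *					   new_item, MAXALIGN(IndexTupleSize(new_item)),
 *					   stack->bts_offset + 1, 0, isonly);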
*/ Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child) { BlockNumber blkno; OffsetNumber start; blkno = stack->bts_blkno; start = stack->bts_offset; for (;;) { Buffer buf; Page page; BTPageOpaque opaque; buf = _bt_getbuf(rel, blkno, BT_WRITE); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); Assert(heaprel != NULL); if (P_INCOMPLETE_SPLIT(opaque)) { _bt_finish_split(rel, heaprel, buf, stack->bts_parent); continue; } if (!P_IGNORE(opaque)) { OffsetNumber offnum, minoff, maxoff; ItemId itemid; IndexTuple item; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); /* * start = InvalidOffsetNumber means "search the whole page". We * need this test anyway due to possibility that page has a high * key now when it didn't before. */ if (start < minoff) start = minoff; /* * Need this check too, to guard against possibility that page * split since we visited it originally. */ if (start > maxoff) start = OffsetNumberNext(maxoff); /* * These loops will check every item on the page --- but in an * order that's attuned to the probability of where it actually * is. Scan to the right first, then to the left. */ for (offnum = start; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); if (BTreeTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; stack->bts_offset = offnum; return buf; } } for (offnum = OffsetNumberPrev(start); offnum >= minoff; offnum = OffsetNumberPrev(offnum)) { itemid = PageGetItemId(page, offnum); item = (IndexTuple) PageGetItem(page, itemid); if (BTreeTupleGetDownLink(item) == child) { /* Return accurate pointer to where link is now */ stack->bts_blkno = blkno; stack->bts_offset = offnum; return buf; } } } /* * The item we're looking for moved right at least one page. * * Lehman and Yao couple/chain locks when moving right here, which we * can avoid. See nbtree/README. */ if (P_RIGHTMOST(opaque)) { _bt_relbuf(rel, buf); return InvalidBuffer; } blkno = opaque->btpo_next; start = InvalidOffsetNumber; _bt_relbuf(rel, buf); } } /* * _bt_newlevel() -- Create a new level above root page. * * We've just split the old root page and need to create a new one. * In order to do this, we add a new root page to the file, then lock * the metadata page and update it. This is guaranteed to be deadlock- * free, because all readers release their locks on the metadata page * before trying to lock the root, and all writers lock the root before * trying to lock the metadata page. We have a write lock on the old * root page, so we have not introduced any cycles into the waits-for * graph. * * On entry, lbuf (the old root) and rbuf (its new peer) are write- * locked. On exit, a new root page exists with entries for the * two new children, metapage is updated and unlocked/unpinned. * The new root buffer is returned to caller which has to unlock/unpin * lbuf, rbuf & rootbuf. 
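 *
 * Sketch of the result (purely illustrative; "n" is the old root's level):
 *
 *			new root (level n+1)
 *			[-inf downlink -> lbuf]  [copy of lbuf's high key -> rbuf]
 *				|							|
 *			lbuf (old root, level n)	rbuf (its new right sibling)
 *
 * Both btm_root and btm_fastroot in the metapage end up pointing at the
 * new root block.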
*/ static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) { Buffer rootbuf; Page lpage, rootpage; BlockNumber lbkno, rbkno; BlockNumber rootblknum; BTPageOpaque rootopaque; BTPageOpaque lopaque; ItemId itemid; IndexTuple item; IndexTuple left_item; Size left_item_sz; IndexTuple right_item; Size right_item_sz; Buffer metabuf; Page metapg; BTMetaPageData *metad; lbkno = BufferGetBlockNumber(lbuf); rbkno = BufferGetBlockNumber(rbuf); lpage = BufferGetPage(lbuf); lopaque = BTPageGetOpaque(lpage); /* get a new root page */ rootbuf = _bt_allocbuf(rel, heaprel); rootpage = BufferGetPage(rootbuf); rootblknum = BufferGetBlockNumber(rootbuf); /* acquire lock on the metapage */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); /* * Create downlink item for left page (old root). The key value used is * "minus infinity", a sentinel value that's reliably less than any real * key value that could appear in the left page. */ left_item_sz = sizeof(IndexTupleData); left_item = (IndexTuple) palloc(left_item_sz); left_item->t_info = left_item_sz; BTreeTupleSetDownLink(left_item, lbkno); BTreeTupleSetNAtts(left_item, 0, false); /* * Create downlink item for right page. The key for it is obtained from * the "high key" position in the left page. */ itemid = PageGetItemId(lpage, P_HIKEY); right_item_sz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(lpage, itemid); right_item = CopyIndexTuple(item); BTreeTupleSetDownLink(right_item, rbkno); /* NO EREPORT(ERROR) from here till newroot op is logged */ START_CRIT_SECTION(); /* upgrade metapage if needed */ if (metad->btm_version < BTREE_NOVAC_VERSION) _bt_upgrademetapage(metapg); /* set btree special data */ rootopaque = BTPageGetOpaque(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = BTP_ROOT; rootopaque->btpo_level = (BTPageGetOpaque(lpage))->btpo_level + 1; rootopaque->btpo_cycleid = 0; /* update metapage data */ metad->btm_root = rootblknum; metad->btm_level = rootopaque->btpo_level; metad->btm_fastroot = rootblknum; metad->btm_fastlevel = rootopaque->btpo_level; /* * Insert the left page pointer into the new root page. The root page is * the rightmost page on its level so there is no "high key" in it; the * two items will go into positions P_HIKEY and P_FIRSTKEY. * * Note: we *must* insert the two items in item-number order, for the * benefit of _bt_restore_page(). */ Assert(BTreeTupleGetNAtts(left_item, rel) == 0); if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page" " while splitting block %u of index \"%s\"", BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); /* * insert the right page pointer into the new root page. 
*/ Assert(BTreeTupleGetNAtts(right_item, rel) > 0); Assert(BTreeTupleGetNAtts(right_item, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add rightkey to new root page" " while splitting block %u of index \"%s\"", BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); /* Clear the incomplete-split flag in the left child */ Assert(P_INCOMPLETE_SPLIT(lopaque)); lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; MarkBufferDirty(lbuf); MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_newroot xlrec; XLogRecPtr recptr; xl_btree_metadata md; xlrec.rootblk = rootblknum; xlrec.level = metad->btm_level; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot); XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); Assert(metad->btm_version >= BTREE_NOVAC_VERSION); md.version = metad->btm_version; md.root = rootblknum; md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; md.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); /* * Direct access to page is not good but faster - we should implement * some new func in page API. */ XLogRegisterBufData(0, (char *) rootpage + ((PageHeader) rootpage)->pd_upper, ((PageHeader) rootpage)->pd_special - ((PageHeader) rootpage)->pd_upper); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); PageSetLSN(lpage, recptr); PageSetLSN(rootpage, recptr); PageSetLSN(metapg, recptr); } END_CRIT_SECTION(); /* done with metapage */ _bt_relbuf(rel, metabuf); pfree(left_item); pfree(right_item); return rootbuf; } /* * _bt_pgaddtup() -- add a data item to a particular page during split. * * The difference between this routine and a bare PageAddItem call is * that this code can deal with the first data item on an internal btree * page in passing. This data item (which is called "firstright" within * _bt_split()) has a key that must be treated as minus infinity after * the split. Therefore, we truncate away all attributes when caller * specifies it's the first data item on page (downlink is not changed, * though). This extra step is only needed for the right page of an * internal page split. There is no need to do this for the first data * item on the existing/left page, since that will already have been * truncated during an earlier page split. * * See _bt_split() for a high level explanation of why we truncate here. * Note that this routine has nothing to do with suffix truncation, * despite using some of the same infrastructure. */ static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off, bool newfirstdataitem) { IndexTupleData trunctuple; if (newfirstdataitem) { trunctuple = *itup; trunctuple.t_info = sizeof(IndexTupleData); BTreeTupleSetNAtts(&trunctuple, 0, false); itup = &trunctuple; itemsize = sizeof(IndexTupleData); } if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)) return false; return true; } /* * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split. * * There are three operations performed here: simple index deletion, bottom-up * index deletion, and deduplication. 
 * If all three operations fail to free enough space for the incoming item
 * then caller will go on to split the page.  We always consider simple
 * deletion first.  If that doesn't work out we consider alternatives.
 * Callers that only want us to consider simple deletion (without any
 * fallback) ask for that using the 'simpleonly' argument.
 *
 * We usually pick only one alternative "complex" operation when simple
 * deletion alone won't prevent a page split.  The 'checkingunique',
 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
 *
 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
 * level flag was found set.  The flag was useful back when there wasn't
 * necessarily one single page for a duplicate tuple to go on (before heap
 * TID became a part of the key space in version 4 indexes).  But we don't
 * actually look at the flag anymore (it's not a gating condition for our
 * caller).  That would cause us to miss tuples that are safe to delete,
 * without getting any benefit in return.  We know that the alternative is
 * to split the page; scanning the line pointer array in passing won't have
 * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
 * all this because !heapkeyspace indexes must still do a "getting tired"
 * linear search, and so are likely to get some benefit from using it as a
 * gating condition.)
 */
static void
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
							 BTInsertState insertstate,
							 bool simpleonly, bool checkingunique,
							 bool uniquedup, bool indexUnchanged)
{
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;
	OffsetNumber offnum,
				minoff,
				maxoff;
	Buffer		buffer = insertstate->buf;
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(buffer);
	BTPageOpaque opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(simpleonly || itup_key->heapkeyspace);
	Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));

	/*
	 * Scan over all items to see which ones need to be deleted according to
	 * LP_DEAD flags.  We'll usually manage to delete a few extra items that
	 * are not marked LP_DEAD in passing.  Often the extra items that
	 * actually end up getting deleted are items that would have had their
	 * LP_DEAD bit set before long anyway (if we opted not to include them
	 * as extras).
	 */
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
						   insertstate->itup, minoff, maxoff);
		insertstate->bounds_valid = false;

		/* Return when a page split has already been avoided */
		if (PageGetFreeSpace(page) >= insertstate->itemsz)
			return;

		/* Might as well assume duplicates (if checkingunique) */
		uniquedup = true;
	}

	/*
	 * We're done with simple deletion.  Return early for callers that only
	 * want simple deletion to be considered.  This includes callers that
	 * explicitly ask for this and checkingunique callers that probably don't
	 * have any version churn duplicates on the page.
	 *
	 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
	 * return at this point (or when we go on to try either or both of our
	 * other strategies and they also fail).  We do not bother expending a
	 * separate write to clear it, however.
Caller will definitely clear it * when it goes on to split the page (note also that the deduplication * process will clear the flag in passing, just to keep things tidy). */ if (simpleonly || (checkingunique && !uniquedup)) { Assert(!indexUnchanged); return; } /* Assume bounds about to be invalidated (this is almost certain now) */ insertstate->bounds_valid = false; /* * Perform bottom-up index deletion pass when executor hint indicated that * incoming item is logically unchanged, or for a unique index that is * known to have physical duplicates for some other reason. (There is a * large overlap between these two cases for a unique index. It's worth * having both triggering conditions in order to apply the optimization in * the event of successive related INSERT and DELETE statements.) * * We'll go on to do a deduplication pass when a bottom-up pass fails to * delete an acceptable amount of free space (a significant fraction of * the page, or space for the new item, whichever is greater). * * Note: Bottom-up index deletion uses the same equality/equivalence * routines as deduplication internally. However, it does not merge * together index tuples, so the same correctness considerations do not * apply. We deliberately omit an index-is-allequalimage test here. */ if ((indexUnchanged || uniquedup) && _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz)) return; /* Perform deduplication pass (when enabled and index-is-allequalimage) */ if (BTGetDeduplicateItems(rel) && itup_key->allequalimage) _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz, (indexUnchanged || uniquedup)); } /* * _bt_simpledel_pass - Simple index tuple deletion pass. * * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers * of all such tuples are determined by caller (caller passes these to us as * its 'deletable' argument). * * We might also delete extra index tuples that turn out to be safe to delete * in passing (though they must be cheap to check in passing to begin with). * There is no certainty that any extra tuples will be deleted, though. The * high level goal of the approach we take is to get the most out of each call * here (without noticeably increasing the per-call overhead compared to what * we need to do just to be able to delete the page's LP_DEAD-marked index * tuples). * * The number of extra index tuples that turn out to be deletable might * greatly exceed the number of LP_DEAD-marked index tuples due to various * locality related effects. For example, it's possible that the total number * of table blocks (pointed to by all TIDs on the leaf page) is naturally * quite low, in which case we might end up checking if it's possible to * delete _most_ index tuples on the page (without the tableam needing to * access additional table blocks). The tableam will sometimes stumble upon * _many_ extra deletable index tuples in indexes where this pattern is * common. * * See nbtree/README for further details on simple index tuple deletion. 
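 *
 * Illustrative example (hypothetical block numbers): if the page's LP_DEAD
 * items point into table blocks 17 and 42, then every other TID on the
 * page that also points into block 17 or 42 is passed to the tableam as a
 * candidate, since those table blocks must be visited anyway to delete the
 * LP_DEAD items.  TIDs that point into any other table block are skipped
 * entirely, keeping the per-call overhead low.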
*/ static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, OffsetNumber minoff, OffsetNumber maxoff) { Page page = BufferGetPage(buffer); BlockNumber *deadblocks; int ndeadblocks; TM_IndexDeleteOp delstate; OffsetNumber offnum; /* Get array of table blocks pointed to by LP_DEAD-set tuples */ deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem, &ndeadblocks); /* Initialize tableam state that describes index deletion operation */ delstate.irel = rel; delstate.iblknum = BufferGetBlockNumber(buffer); delstate.bottomup = false; delstate.bottomupfreespace = 0; delstate.ndeltids = 0; delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid = PageGetItemId(page, offnum); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids]; TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids]; BlockNumber tidblock; void *match; if (!BTreeTupleIsPosting(itup)) { tidblock = ItemPointerGetBlockNumber(&itup->t_tid); match = bsearch(&tidblock, deadblocks, ndeadblocks, sizeof(BlockNumber), _bt_blk_cmp); if (!match) { Assert(!ItemIdIsDead(itemid)); continue; } /* * TID's table block is among those pointed to by the TIDs from * LP_DEAD-bit set tuples on page -- add TID to deltids */ odeltid->tid = itup->t_tid; odeltid->id = delstate.ndeltids; ostatus->idxoffnum = offnum; ostatus->knowndeletable = ItemIdIsDead(itemid); ostatus->promising = false; /* unused */ ostatus->freespace = 0; /* unused */ delstate.ndeltids++; } else { int nitem = BTreeTupleGetNPosting(itup); for (int p = 0; p < nitem; p++) { ItemPointer tid = BTreeTupleGetPostingN(itup, p); tidblock = ItemPointerGetBlockNumber(tid); match = bsearch(&tidblock, deadblocks, ndeadblocks, sizeof(BlockNumber), _bt_blk_cmp); if (!match) { Assert(!ItemIdIsDead(itemid)); continue; } /* * TID's table block is among those pointed to by the TIDs * from LP_DEAD-bit set tuples on page -- add TID to deltids */ odeltid->tid = *tid; odeltid->id = delstate.ndeltids; ostatus->idxoffnum = offnum; ostatus->knowndeletable = ItemIdIsDead(itemid); ostatus->promising = false; /* unused */ ostatus->freespace = 0; /* unused */ odeltid++; ostatus++; delstate.ndeltids++; } } } pfree(deadblocks); Assert(delstate.ndeltids >= ndeletable); /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */ _bt_delitems_delete_check(rel, buffer, heapRel, &delstate); pfree(delstate.deltids); pfree(delstate.status); } /* * _bt_deadblocks() -- Get LP_DEAD related table blocks. * * Builds sorted and unique-ified array of table block numbers from index * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table * block from incoming newitem just in case it isn't among the LP_DEAD-related * table blocks. * * Always counting the newitem's table block as an LP_DEAD related block makes * sense because the cost is consistently low; it is practically certain that * the table block will not incur a buffer miss in tableam. On the other hand * the benefit is often quite high. There is a decent chance that there will * be some deletable items from this block, since in general most garbage * tuples became garbage in the recent past (in many cases this won't be the * first logical row that core code added to/modified in table block * recently). 
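 *
 * Worked example (made-up TIDs): LP_DEAD items with TIDs (17,3), (17,9)
 * and (42,1), plus a newitem TID of (63,5), produce the working array
 * {63, 17, 17, 42}; qsort() and qunique() then reduce it to the returned
 * array {17, 42, 63}.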
* * Returns final array, and sets *nblocks to its final size for caller. */ static BlockNumber * _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, IndexTuple newitem, int *nblocks) { int spacentids, ntids; BlockNumber *tidblocks; /* * Accumulate each TID's block in array whose initial size has space for * one table block per LP_DEAD-set tuple (plus space for the newitem table * block). Array will only need to grow when there are LP_DEAD-marked * posting list tuples (which is not that common). */ spacentids = ndeletable + 1; ntids = 0; tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids); /* * First add the table block for the incoming newitem. This is the one * case where simple deletion can visit a table block that doesn't have * any known deletable items. */ Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem)); tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid); for (int i = 0; i < ndeletable; i++) { ItemId itemid = PageGetItemId(page, deletable[i]); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); Assert(ItemIdIsDead(itemid)); if (!BTreeTupleIsPosting(itup)) { if (ntids + 1 > spacentids) { spacentids *= 2; tidblocks = (BlockNumber *) repalloc(tidblocks, sizeof(BlockNumber) * spacentids); } tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid); } else { int nposting = BTreeTupleGetNPosting(itup); if (ntids + nposting > spacentids) { spacentids = Max(spacentids * 2, ntids + nposting); tidblocks = (BlockNumber *) repalloc(tidblocks, sizeof(BlockNumber) * spacentids); } for (int j = 0; j < nposting; j++) { ItemPointer tid = BTreeTupleGetPostingN(itup, j); tidblocks[ntids++] = ItemPointerGetBlockNumber(tid); } } } qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); return tidblocks; } /* * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass */ static inline int _bt_blk_cmp(const void *arg1, const void *arg2) { BlockNumber b1 = *((BlockNumber *) arg1); BlockNumber b2 = *((BlockNumber *) arg2); return pg_cmp_u32(b1, b2); }
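/*
 * Illustrative sketch only, compiled out of the build (the
 * NBTINSERT_ILLUSTRATION guard is hypothetical and never defined): the
 * sort/unique/probe pattern that _bt_deadblocks() and _bt_simpledel_pass()
 * apply to LP_DEAD-related table block numbers, in minimal standalone form.
 */
#ifdef NBTINSERT_ILLUSTRATION
static bool
example_block_is_interesting(BlockNumber *blocks, int nblocks,
							 BlockNumber target)
{
	/* Sort and unique-ify in place, as _bt_deadblocks() does */
	qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
	nblocks = (int) qunique(blocks, nblocks, sizeof(BlockNumber),
							_bt_blk_cmp);

	/* Probe with bsearch, as _bt_simpledel_pass() does for each TID */
	return bsearch(&target, blocks, nblocks, sizeof(BlockNumber),
				   _bt_blk_cmp) != NULL;
}
#endif							/* NBTINSERT_ILLUSTRATION */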