/*-------------------------------------------------------------------------
 *
 * nbtdedup.c
 *	  Deduplicate or bottom-up delete items in Postgres btrees.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtdedup.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"

static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                           TM_IndexDeleteOp *delstate);
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                             OffsetNumber minoff, IndexTuple newitem);
static void _bt_singleval_fillfactor(Page page, BTDedupState state,
                                     Size newitemsz);
#ifdef USE_ASSERT_CHECKING
static bool _bt_posting_valid(IndexTuple posting);
#endif

/*
 * Perform a deduplication pass.
 *
 * The general approach taken here is to perform as much deduplication as
 * possible to free as much space as possible.  Note, however, that "single
 * value" strategy is used for !bottomupdedup callers when the page is full of
 * tuples of a single value.  Deduplication passes that apply the strategy
 * will leave behind a few untouched tuples at the end of the page, preparing
 * the page for an anticipated page split that uses nbtsplitloc.c's own single
 * value strategy.  Our high level goal is to delay merging the untouched
 * tuples until after the page splits.
 *
 * When a call to _bt_bottomupdel_pass() just took place (and failed), our
 * high level goal is to prevent a page split entirely by buying more time.
 * We still hope that a page split can be avoided altogether.  That's why
 * single value strategy is not even considered for bottomupdedup callers.
 *
 * The page will have to be split if we cannot successfully free at least
 * newitemsz (we also need space for newitem's line pointer, which isn't
 * included in caller's newitemsz).
 *
 * Note: Caller should have already deleted all existing items with their
 * LP_DEAD bits set.
 */
void
_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
               bool bottomupdedup)
{
    OffsetNumber offnum,
                minoff,
                maxoff;
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    Page        newpage;
    BTDedupState state;
    Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
    bool        singlevalstrat = false;
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);

    /*
     * Initialize deduplication state.
     *
     * It would be possible for maxpostingsize (limit on posting list tuple
     * size) to be set to one third of the page.  However, it seems like a
     * good idea to limit the size of posting lists to one sixth of a page.
     * That ought to leave us with a good split point when pages full of
     * duplicates can be split several times.
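     *
     * (Purely as an illustration: with the default 8KB block size,
     * BTMaxItemSize() works out to a little over 2700 bytes, so
     * maxpostingsize starts out at roughly 1350 bytes -- about one sixth of
     * the page.)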
     */
    state = (BTDedupState) palloc(sizeof(BTDedupStateData));
    state->deduplicate = true;
    state->nmaxitems = 0;
    state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
    /* Metadata about base tuple of current pending posting list */
    state->base = NULL;
    state->baseoff = InvalidOffsetNumber;
    state->basetupsize = 0;
    /* Metadata about current pending posting list TIDs */
    state->htids = palloc(state->maxpostingsize);
    state->nhtids = 0;
    state->nitems = 0;
    /* Size of all physical tuples to be replaced by pending posting list */
    state->phystupsize = 0;
    /* nintervals should be initialized to zero */
    state->nintervals = 0;

    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * Consider applying "single value" strategy, though only if the page
     * seems likely to be split in the near future
     */
    if (!bottomupdedup)
        singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);

    /*
     * Deduplicate items from page, and write them to newpage.
     *
     * Copy the original page's LSN into newpage copy.  This will become the
     * updated version of the page.  We need this because XLogInsert will
     * examine the LSN and possibly dump it in a page image.
     */
    newpage = PageGetTempPageCopySpecial(page);
    PageSetLSN(newpage, PageGetLSN(page));

    /* Copy high key, if any */
    if (!P_RIGHTMOST(opaque))
    {
        ItemId      hitemid = PageGetItemId(page, P_HIKEY);
        Size        hitemsz = ItemIdGetLength(hitemid);
        IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);

        if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
                        false, false) == InvalidOffsetNumber)
            elog(ERROR, "deduplication failed to add highkey");
    }

    for (offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(!ItemIdIsDead(itemid));

        if (offnum == minoff)
        {
            /*
             * No previous/base tuple for the data item -- use the data item
             * as base tuple of pending posting list
             */
            _bt_dedup_start_pending(state, itup, offnum);
        }
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /*
             * Tuple is equal to base tuple of pending posting list.  Heap
             * TID(s) for itup have been saved in state.
             */
        }
        else
        {
            /*
             * Tuple is not equal to pending posting list tuple, or
             * _bt_dedup_save_htid() opted to not merge current item into
             * pending posting list for some other reason (e.g., adding more
             * TIDs would have caused posting list to exceed current
             * maxpostingsize).
             *
             * If state contains pending posting list with more than one item,
             * form new posting tuple and add it to our temp page (newpage).
             * Else add pending interval's base tuple to the temp page as-is.
             */
            pagesaving += _bt_dedup_finish_pending(newpage, state);

            if (singlevalstrat)
            {
                /*
                 * Single value strategy's extra steps.
                 *
                 * Lower maxpostingsize for sixth and final large posting list
                 * tuple at the point where 5 maxpostingsize-capped tuples
                 * have either been formed or observed.
                 *
                 * When a sixth maxpostingsize-capped item is formed/observed,
                 * stop merging together tuples altogether.  The few tuples
                 * that remain at the end of the page won't be merged together
                 * at all (at least not until after a future page split takes
                 * place, when this page's newly allocated right sibling page
                 * gets its first deduplication pass).
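                 *
                 * (The cutoff of six lines up with maxpostingsize having been
                 * capped at roughly one sixth of the page, so about six such
                 * maxpostingsize-capped tuples are enough to fill the page.)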
                 */
                if (state->nmaxitems == 5)
                    _bt_singleval_fillfactor(page, state, newitemsz);
                else if (state->nmaxitems == 6)
                {
                    state->deduplicate = false;
                    singlevalstrat = false; /* won't be back here */
                }
            }

            /* itup starts new pending posting list */
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }

    /* Handle the last item */
    pagesaving += _bt_dedup_finish_pending(newpage, state);

    /*
     * If no items suitable for deduplication were found, newpage must be
     * exactly the same as the original page, so just return from function.
     *
     * We could determine whether or not to proceed on the basis of the space
     * savings being sufficient to avoid an immediate page split instead.  We
     * don't do that because there is some small value in nbtsplitloc.c always
     * operating against a page that is fully deduplicated (apart from
     * newitem).  Besides, most of the cost has already been paid.
     */
    if (state->nintervals == 0)
    {
        /* cannot leak memory here */
        pfree(newpage);
        pfree(state->htids);
        pfree(state);
        return;
    }

    /*
     * By here, it's clear that deduplication will definitely go ahead.
     *
     * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
     * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
     * But keep things tidy.
     */
    if (P_HAS_GARBAGE(opaque))
    {
        BTPageOpaque nopaque = BTPageGetOpaque(newpage);

        nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
    }

    START_CRIT_SECTION();

    PageRestoreTempPage(newpage, page);
    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_dedup xlrec_dedup;

        xlrec_dedup.nintervals = state->nintervals;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);

        /*
         * The intervals array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, (char *) state->intervals,
                            state->nintervals * sizeof(BTDedupInterval));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* Local space accounting should agree with page accounting */
    Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);

    /* cannot leak memory here */
    pfree(state->htids);
    pfree(state);
}

/*
 * Perform bottom-up index deletion pass.
 *
 * See if duplicate index tuples (plus certain nearby tuples) are eligible to
 * be deleted via bottom-up index deletion.  The high level goal here is to
 * entirely prevent "unnecessary" page splits caused by MVCC version churn
 * from UPDATEs (when the UPDATEs don't logically modify any of the columns
 * covered by the 'rel' index).  This is qualitative, not quantitative -- we
 * do not particularly care about once-off opportunities to delete many index
 * tuples together.
 *
 * See nbtree/README for details on the design of nbtree bottom-up deletion.
 * See access/tableam.h for a description of how we're expected to cooperate
 * with the tableam.
 *
 * Returns true on success, in which case caller can assume page split will be
 * avoided for a reasonable amount of time.  Returns false when caller should
 * deduplicate the page (if possible at all).
 *
 * Note: Occasionally we return true despite failing to delete enough items to
 * avoid a split.  This makes caller skip deduplication and go split the page
 * right away.  Our return value is always just advisory information.
 *
 * Note: Caller should have already deleted all existing items with their
 * LP_DEAD bits set.
 */
bool
_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
                     Size newitemsz)
{
    OffsetNumber offnum,
                minoff,
                maxoff;
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    BTDedupState state;
    TM_IndexDeleteOp delstate;
    bool        neverdedup;
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);

    /* Initialize deduplication state */
    state = (BTDedupState) palloc(sizeof(BTDedupStateData));
    state->deduplicate = true;
    state->nmaxitems = 0;
    state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
    state->base = NULL;
    state->baseoff = InvalidOffsetNumber;
    state->basetupsize = 0;
    state->htids = palloc(state->maxpostingsize);
    state->nhtids = 0;
    state->nitems = 0;
    state->phystupsize = 0;
    state->nintervals = 0;

    /*
     * Initialize tableam state that describes bottom-up index deletion
     * operation.
     *
     * We'll go on to ask the tableam to search for TIDs whose index tuples we
     * can safely delete.  The tableam will search until our leaf page space
     * target is satisfied, or until the cost of continuing with the tableam
     * operation seems too high.  It focuses its efforts on TIDs associated
     * with duplicate index tuples that we mark "promising".
     *
     * This space target is a little arbitrary.  The tableam must be able to
     * keep the costs and benefits in balance.  We provide the tableam with
     * exhaustive information about what might work, without directly
     * concerning ourselves with avoiding work during the tableam call.  Our
     * role in costing the bottom-up deletion process is strictly advisory.
     */
    delstate.irel = rel;
    delstate.iblknum = BufferGetBlockNumber(buf);
    delstate.bottomup = true;
    delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(!ItemIdIsDead(itemid));

        if (offnum == minoff)
        {
            /* itup starts first pending interval */
            _bt_dedup_start_pending(state, itup, offnum);
        }
        else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* Tuple is equal; just added its TIDs to pending interval */
        }
        else
        {
            /* Finalize interval -- move its TIDs to delete state */
            _bt_bottomupdel_finish_pending(page, state, &delstate);

            /* itup starts new pending interval */
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    /* Finalize final interval -- move its TIDs to delete state */
    _bt_bottomupdel_finish_pending(page, state, &delstate);

    /*
     * We don't give up now in the event of having few (or even zero)
     * promising tuples for the tableam because it's not up to us as the
     * index AM to manage costs (note that the tableam might have heuristics
     * of its own that work out what to do).  We should at least avoid having
     * our caller do a useless deduplication pass after we return in the
     * event of zero promising tuples, though.
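     *
     * (An interval count of zero means that no data item on the page had
     * even a single duplicate, so a deduplication pass could not free any
     * space either.)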
     */
    neverdedup = false;
    if (state->nintervals == 0)
        neverdedup = true;

    pfree(state->htids);
    pfree(state);

    /* Ask tableam which TIDs are deletable, then physically delete them */
    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);

    pfree(delstate.deltids);
    pfree(delstate.status);

    /* Report "success" to caller unconditionally to avoid deduplication */
    if (neverdedup)
        return true;

    /* Don't dedup when we won't end up back here any time soon anyway */
    return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
}

/*
 * Create a new pending posting list tuple based on caller's base tuple.
 *
 * Every tuple processed by deduplication either becomes the base tuple for a
 * posting list, or gets its heap TID(s) accepted into a pending posting list.
 * A tuple that starts out as the base tuple for a posting list will only
 * actually be rewritten within _bt_dedup_finish_pending() when it turns out
 * that there are duplicates that can be merged into the base tuple.
 */
void
_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
                        OffsetNumber baseoff)
{
    Assert(state->nhtids == 0);
    Assert(state->nitems == 0);
    Assert(!BTreeTupleIsPivot(base));

    /*
     * Copy heap TID(s) from new base tuple for new candidate posting list
     * into working state's array
     */
    if (!BTreeTupleIsPosting(base))
    {
        memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
        state->nhtids = 1;
        state->basetupsize = IndexTupleSize(base);
    }
    else
    {
        int         nposting;

        nposting = BTreeTupleGetNPosting(base);
        memcpy(state->htids, BTreeTupleGetPosting(base),
               sizeof(ItemPointerData) * nposting);
        state->nhtids = nposting;
        /* basetupsize should not include existing posting list */
        state->basetupsize = BTreeTupleGetPostingOffset(base);
    }

    /*
     * Save new base tuple itself -- it'll be needed if we actually create a
     * new posting list from new pending posting list.
     *
     * Must maintain physical size of all existing tuples (including line
     * pointer overhead) so that we can calculate space savings on page.
     */
    state->nitems = 1;
    state->base = base;
    state->baseoff = baseoff;
    state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
    /* Also save baseoff in pending state for interval */
    state->intervals[state->nintervals].baseoff = state->baseoff;
}

/*
 * Save itup heap TID(s) into pending posting list where possible.
 *
 * Returns bool indicating if the pending posting list managed by state now
 * includes itup's heap TID(s).
 */
bool
_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
{
    int         nhtids;
    ItemPointer htids;
    Size        mergedtupsz;

    Assert(!BTreeTupleIsPivot(itup));

    if (!BTreeTupleIsPosting(itup))
    {
        nhtids = 1;
        htids = &itup->t_tid;
    }
    else
    {
        nhtids = BTreeTupleGetNPosting(itup);
        htids = BTreeTupleGetPosting(itup);
    }

    /*
     * Don't append (have caller finish pending posting list as-is) if
     * appending heap TID(s) from itup would put us over maxpostingsize limit.
     *
     * This calculation needs to match the code used within _bt_form_posting()
     * for new posting list tuples.
     */
    mergedtupsz = MAXALIGN(state->basetupsize +
                           (state->nhtids + nhtids) * sizeof(ItemPointerData));

    if (mergedtupsz > state->maxpostingsize)
    {
        /*
         * Count this as an oversized item for single value strategy, though
         * only when there are 50 TIDs in the final posting list tuple.  This
         * limit (which is fairly arbitrary) avoids confusion about how many
         * 1/6 of a page tuples have been encountered/created by the current
         * deduplication pass.
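         *
         * (For comparison, a maxpostingsize-capped posting list tuple on a
         * default 8KB page holds on the order of two hundred TIDs, so 50 is
         * a deliberately low bar.)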
         *
         * Note: We deliberately don't consider which deduplication pass
         * merged together tuples to create this item (could be a previous
         * deduplication pass, or current pass).  See _bt_do_singleval()
         * comments.
         */
        if (state->nhtids > 50)
            state->nmaxitems++;

        return false;
    }

    /*
     * Save heap TIDs to pending posting list tuple -- itup can be merged into
     * pending posting list
     */
    state->nitems++;
    memcpy(state->htids + state->nhtids, htids,
           sizeof(ItemPointerData) * nhtids);
    state->nhtids += nhtids;
    state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);

    return true;
}

/*
 * Finalize pending posting list tuple, and add it to the page.  Final tuple
 * is based on saved base tuple, and saved list of heap TIDs.
 *
 * Returns space saving from deduplicating to make a new posting list tuple.
 * Note that this includes line pointer overhead.  This is zero in the case
 * where no deduplication was possible.
 */
Size
_bt_dedup_finish_pending(Page newpage, BTDedupState state)
{
    OffsetNumber tupoff;
    Size        tuplesz;
    Size        spacesaving;

    Assert(state->nitems > 0);
    Assert(state->nitems <= state->nhtids);
    Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

    tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
    if (state->nitems == 1)
    {
        /* Use original, unchanged base tuple */
        tuplesz = IndexTupleSize(state->base);
        Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
        Assert(tuplesz <= BTMaxItemSize(newpage));
        if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
                        false, false) == InvalidOffsetNumber)
            elog(ERROR, "deduplication failed to add tuple to page");

        spacesaving = 0;
    }
    else
    {
        IndexTuple  final;

        /* Form a tuple with a posting list */
        final = _bt_form_posting(state->base, state->htids, state->nhtids);
        tuplesz = IndexTupleSize(final);
        Assert(tuplesz <= state->maxpostingsize);

        /* Save final number of items for posting list */
        state->intervals[state->nintervals].nitems = state->nitems;

        Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
        Assert(tuplesz <= BTMaxItemSize(newpage));
        if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
                        false) == InvalidOffsetNumber)
            elog(ERROR, "deduplication failed to add tuple to page");

        pfree(final);
        spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
        /* Increment nintervals, since we wrote a new posting list tuple */
        state->nintervals++;
        Assert(spacesaving > 0 && spacesaving < BLCKSZ);
    }

    /* Reset state for next pending posting list */
    state->nhtids = 0;
    state->nitems = 0;
    state->phystupsize = 0;

    return spacesaving;
}

/*
 * Finalize interval during bottom-up index deletion.
 *
 * During a bottom-up pass we expect that TIDs will be recorded in dedup state
 * first, and then get moved over to delstate (in variable-sized batches) by
 * calling here.  Call here happens when the number of TIDs in a dedup
 * interval is known, and interval gets finalized (i.e. when caller sees next
 * tuple on the page is not a duplicate, or when caller runs out of tuples to
 * process from leaf page).
 *
 * This is where bottom-up deletion determines and remembers which entries are
 * duplicates.  This will be important information to the tableam delete
 * infrastructure later on.  Plain index tuple duplicates are marked
 * "promising" here, per tableam contract.
 *
 * Our approach to marking entries whose TIDs come from posting lists is more
 * complicated.  Posting lists can only be formed by a deduplication pass (or
 * during an index build), so recent version churn affecting the pointed-to
 * logical rows is not particularly likely.
 * We may still give a weak signal about posting list tuples' entries (by
 * marking just one of its TIDs/entries promising), though this is only a
 * possibility in the event of further duplicate index tuples in final
 * interval that covers posting list tuple (as in the plain tuple case).  A
 * weak signal/hint will be useful to the tableam when it has no stronger
 * signal to go with for the deletion operation as a whole.
 *
 * The heuristics we use work well in practice because we only need to give
 * the tableam the right _general_ idea about where to look.  Garbage tends to
 * naturally get concentrated in relatively few table blocks with workloads
 * that bottom-up deletion targets.  The tableam cannot possibly rank all
 * available table blocks sensibly based on the hints we provide, but that's
 * okay -- only the extremes matter.  The tableam just needs to be able to
 * predict which few table blocks will have the most tuples that are safe to
 * delete for each deletion operation, with low variance across related
 * deletion operations.
 */
static void
_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                               TM_IndexDeleteOp *delstate)
{
    bool        dupinterval = (state->nitems > 1);

    Assert(state->nitems > 0);
    Assert(state->nitems <= state->nhtids);
    Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

    for (int i = 0; i < state->nitems; i++)
    {
        OffsetNumber offnum = state->baseoff + i;
        ItemId      itemid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
        TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
        TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];

        if (!BTreeTupleIsPosting(itup))
        {
            /* Simple case: A plain non-pivot tuple */
            ideltid->tid = itup->t_tid;
            ideltid->id = delstate->ndeltids;
            istatus->idxoffnum = offnum;
            istatus->knowndeletable = false;    /* for now */
            istatus->promising = dupinterval;   /* simple rule */
            istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);

            delstate->ndeltids++;
        }
        else
        {
            /*
             * Complicated case: A posting list tuple.
             *
             * We make the conservative assumption that there can only be at
             * most one affected logical row per posting list tuple.  There
             * will be at most one promising entry in deltids to represent
             * this presumed lone logical row.  Note that this isn't even
             * considered unless the posting list tuple is also in an interval
             * of duplicates -- this complicated rule is just a variant of the
             * simple rule used to decide if plain index tuples are promising.
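             *
             * As a purely illustrative example: if most of a posting list's
             * TIDs point to table block 10 and only the last few point to
             * block 200, then the first and middle TIDs share block 10, and
             * so the posting list's first TID is the one marked promising.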
             */
            int         nitem = BTreeTupleGetNPosting(itup);
            bool        firstpromising = false;
            bool        lastpromising = false;

            Assert(_bt_posting_valid(itup));

            if (dupinterval)
            {
                /*
                 * Complicated rule: either the first or last TID in the
                 * posting list gets marked promising (if any at all)
                 */
                BlockNumber minblocklist,
                            midblocklist,
                            maxblocklist;
                ItemPointer mintid,
                            midtid,
                            maxtid;

                mintid = BTreeTupleGetHeapTID(itup);
                midtid = BTreeTupleGetPostingN(itup, nitem / 2);
                maxtid = BTreeTupleGetMaxHeapTID(itup);
                minblocklist = ItemPointerGetBlockNumber(mintid);
                midblocklist = ItemPointerGetBlockNumber(midtid);
                maxblocklist = ItemPointerGetBlockNumber(maxtid);

                /* Only entry with predominant table block can be promising */
                firstpromising = (minblocklist == midblocklist);
                lastpromising = (!firstpromising &&
                                 midblocklist == maxblocklist);
            }

            for (int p = 0; p < nitem; p++)
            {
                ItemPointer htid = BTreeTupleGetPostingN(itup, p);

                ideltid->tid = *htid;
                ideltid->id = delstate->ndeltids;
                istatus->idxoffnum = offnum;
                istatus->knowndeletable = false;    /* for now */
                istatus->promising = false;
                if ((firstpromising && p == 0) ||
                    (lastpromising && p == nitem - 1))
                    istatus->promising = true;
                istatus->freespace = sizeof(ItemPointerData);   /* at worst */

                ideltid++;
                istatus++;
                delstate->ndeltids++;
            }
        }
    }

    if (dupinterval)
    {
        state->intervals[state->nintervals].nitems = state->nitems;
        state->nintervals++;
    }

    /* Reset state for next interval */
    state->nhtids = 0;
    state->nitems = 0;
    state->phystupsize = 0;
}

/*
 * Determine if page non-pivot tuples (data items) are all duplicates of the
 * same value -- if they are, deduplication's "single value" strategy should
 * be applied.  The general goal of this strategy is to ensure that
 * nbtsplitloc.c (which uses its own single value strategy) will find a useful
 * split point as further duplicates are inserted, and successive rightmost
 * page splits occur among pages that store the same duplicate value.  When
 * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
 * just like it would if deduplication were disabled.
 *
 * We expect that affected workloads will require _several_ single value
 * strategy deduplication passes (over a page that only stores duplicates)
 * before the page is finally split.  The first deduplication pass should only
 * find regular non-pivot tuples.  Later deduplication passes will find
 * existing maxpostingsize-capped posting list tuples, which must be skipped
 * over.  The penultimate pass is generally the first pass that actually
 * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
 * few untouched non-pivot tuples.  The final deduplication pass won't free
 * any space -- it will skip over everything without merging anything (it
 * retraces the steps of the penultimate pass).
 *
 * Fortunately, having several passes isn't too expensive.  Each pass (after
 * the first pass) won't spend many cycles on the large posting list tuples
 * left by previous passes.  Each pass will find a large contiguous group of
 * smaller duplicate tuples to merge together at the end of the page.
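 *
 * (The check performed here is cheap, in any case: only the page's first and
 * last data items are compared against newitem.)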
 */
static bool
_bt_do_singleval(Relation rel, Page page, BTDedupState state,
                 OffsetNumber minoff, IndexTuple newitem)
{
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    ItemId      itemid;
    IndexTuple  itup;

    itemid = PageGetItemId(page, minoff);
    itup = (IndexTuple) PageGetItem(page, itemid);

    if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
    {
        itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
        itup = (IndexTuple) PageGetItem(page, itemid);

        if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
            return true;
    }

    return false;
}

/*
 * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
 * and final maxpostingsize-capped tuple.  The sixth and final posting list
 * tuple will end up somewhat smaller than the first five.  (Note: The first
 * five tuples could actually just be very large duplicate tuples that
 * couldn't be merged together at all.  Deduplication will simply not modify
 * the page when that happens.)
 *
 * When there are six posting lists on the page (after current deduplication
 * pass goes on to create/observe a sixth very large tuple), caller should end
 * its deduplication pass.  It isn't useful to try to deduplicate items that
 * are supposed to end up on the new right sibling page following the
 * anticipated page split.  A future deduplication pass of future right
 * sibling page might take care of it.  (This is why the first single value
 * strategy deduplication pass for a given leaf page will generally find only
 * plain non-pivot tuples -- see _bt_do_singleval() comments.)
 */
static void
_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
{
    Size        leftfree;
    int         reduction;

    /* This calculation needs to match nbtsplitloc.c */
    leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
        MAXALIGN(sizeof(BTPageOpaqueData));
    /* Subtract size of new high key (includes pivot heap TID space) */
    leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));

    /*
     * Reduce maxpostingsize by an amount equal to target free space on left
     * half of page
     */
    reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
    if (state->maxpostingsize > reduction)
        state->maxpostingsize -= reduction;
    else
        state->maxpostingsize = 0;
}

/*
 * Build a posting list tuple based on caller's "base" index tuple and list of
 * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
 * posting list.  (Posting list tuples can never have a single heap TID,
 * partly because that ensures that deduplication always reduces final
 * MAXALIGN()'d size of entire tuple.)
 *
 * Convention is that posting list starts at a MAXALIGN()'d offset (rather
 * than a SHORTALIGN()'d offset), in line with the approach taken when
 * appending a heap TID to new pivot tuple/high key during suffix truncation.
 * This sometimes wastes a little space that was only needed as alignment
 * padding in the original tuple.  Following this convention simplifies the
 * space accounting used when deduplicating a page (the same convention
 * simplifies the accounting for choosing a point to split a page at).
 *
 * Note: Caller's "htids" array must be unique and already in ascending TID
 * order.  Any existing heap TIDs from "base" won't automatically appear in
 * returned posting list tuple (they must be included in htids array.)
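 *
 * (As a purely illustrative example of the sizing convention: assuming
 * 8-byte MAXALIGN() and a MAXALIGN()'d keysize, a posting list of 3 heap
 * TIDs needs MAXALIGN(keysize + 3 * sizeof(ItemPointerData)), which works
 * out to keysize + 24 bytes.)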
 */
IndexTuple
_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
{
    uint32      keysize,
                newsize;
    IndexTuple  itup;

    if (BTreeTupleIsPosting(base))
        keysize = BTreeTupleGetPostingOffset(base);
    else
        keysize = IndexTupleSize(base);

    Assert(!BTreeTupleIsPivot(base));
    Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
    Assert(keysize == MAXALIGN(keysize));

    /* Determine final size of new tuple */
    if (nhtids > 1)
        newsize = MAXALIGN(keysize +
                           nhtids * sizeof(ItemPointerData));
    else
        newsize = keysize;

    Assert(newsize <= INDEX_SIZE_MASK);
    Assert(newsize == MAXALIGN(newsize));

    /* Allocate memory using palloc0() (matches index_form_tuple()) */
    itup = palloc0(newsize);
    memcpy(itup, base, keysize);
    itup->t_info &= ~INDEX_SIZE_MASK;
    itup->t_info |= newsize;
    if (nhtids > 1)
    {
        /* Form posting list tuple */
        BTreeTupleSetPosting(itup, nhtids, keysize);
        memcpy(BTreeTupleGetPosting(itup), htids,
               sizeof(ItemPointerData) * nhtids);
        Assert(_bt_posting_valid(itup));
    }
    else
    {
        /* Form standard non-pivot tuple */
        itup->t_info &= ~INDEX_ALT_TID_MASK;
        ItemPointerCopy(htids, &itup->t_tid);
        Assert(ItemPointerIsValid(&itup->t_tid));
    }

    return itup;
}

/*
 * Generate a replacement tuple by "updating" a posting list tuple so that it
 * no longer has TIDs that need to be deleted.
 *
 * Used by both VACUUM and index deletion.  Caller's vacposting argument
 * points to the existing posting list tuple to be updated.
 *
 * On return, caller's vacposting argument will point to final "updated"
 * tuple, which will be palloc()'d in caller's memory context.
 */
void
_bt_update_posting(BTVacuumPosting vacposting)
{
    IndexTuple  origtuple = vacposting->itup;
    uint32      keysize,
                newsize;
    IndexTuple  itup;
    int         nhtids;
    int         ui,
                d;
    ItemPointer htids;

    nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;

    Assert(_bt_posting_valid(origtuple));
    Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));

    /*
     * Determine final size of new tuple.
     *
     * This calculation needs to match the code used within _bt_form_posting()
     * for new posting list tuples.  We avoid calling _bt_form_posting() here
     * to save ourselves a second memory allocation for a htids workspace.
     */
    keysize = BTreeTupleGetPostingOffset(origtuple);
    if (nhtids > 1)
        newsize = MAXALIGN(keysize +
                           nhtids * sizeof(ItemPointerData));
    else
        newsize = keysize;

    Assert(newsize <= INDEX_SIZE_MASK);
    Assert(newsize == MAXALIGN(newsize));

    /* Allocate memory using palloc0() (matches index_form_tuple()) */
    itup = palloc0(newsize);
    memcpy(itup, origtuple, keysize);
    itup->t_info &= ~INDEX_SIZE_MASK;
    itup->t_info |= newsize;

    if (nhtids > 1)
    {
        /* Form posting list tuple */
        BTreeTupleSetPosting(itup, nhtids, keysize);
        htids = BTreeTupleGetPosting(itup);
    }
    else
    {
        /* Form standard non-pivot tuple */
        itup->t_info &= ~INDEX_ALT_TID_MASK;
        htids = &itup->t_tid;
    }

    ui = 0;
    d = 0;
    for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
    {
        if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
        {
            d++;
            continue;
        }
        htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
    }
    Assert(ui == nhtids);
    Assert(d == vacposting->ndeletedtids);
    Assert(nhtids == 1 || _bt_posting_valid(itup));
    Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));

    /* vacposting arg's itup will now point to updated version */
    vacposting->itup = itup;
}

/*
 * Prepare for a posting list split by swapping heap TID in newitem with heap
 * TID from original posting list (the 'oposting' heap TID located at offset
 * 'postingoff').  Modifies newitem, so caller should pass their own private
 * copy that can safely be modified.
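 *
 * As a purely illustrative example: if oposting holds heap TIDs (1,1), (1,3)
 * and (1,5), newitem's TID is (1,4), and postingoff is 2, then the returned
 * posting list holds (1,1), (1,3) and (1,4), while newitem is rewritten to
 * carry (1,5).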
 *
 * Returns new posting list tuple, which is palloc()'d in caller's context.
 * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
 * what caller actually inserts.  (This happens inside the same critical
 * section that performs an in-place update of old posting list using new
 * posting list returned here.)
 *
 * While the keys from newitem and oposting must be opclass equal, and must
 * generate identical output when run through the underlying type's output
 * function, it doesn't follow that their representations match exactly.
 * Caller must avoid assuming that there can't be representational differences
 * that make datums from oposting bigger or smaller than the corresponding
 * datums from newitem.  For example, differences in TOAST input state might
 * break a faulty assumption about tuple size (the executor is entitled to
 * apply TOAST compression based on its own criteria).  It also seems possible
 * that further representational variation will be introduced in the future,
 * in order to support nbtree features like page-level prefix compression.
 *
 * See nbtree/README for details on the design of posting list splits.
 */
IndexTuple
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
{
    int         nhtids;
    char       *replacepos;
    char       *replaceposright;
    Size        nmovebytes;
    IndexTuple  nposting;

    nhtids = BTreeTupleGetNPosting(oposting);
    Assert(_bt_posting_valid(oposting));

    /*
     * The postingoff argument originated as a _bt_binsrch_posting() return
     * value.  It will be 0 in the event of corruption that makes a leaf page
     * contain a non-pivot tuple that's somehow identical to newitem (no two
     * non-pivot tuples should ever have the same TID).  This has been known
     * to happen in the field from time to time.
     *
     * Perform a basic sanity check to catch this case now.
     */
    if (!(postingoff > 0 && postingoff < nhtids))
        elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
             nhtids, postingoff);

    /*
     * Move item pointers in posting list to make a gap for the new item's
     * heap TID.  We shift TIDs one place to the right, losing original
     * rightmost TID.  (nmovebytes must not include TIDs to the left of
     * postingoff, nor the existing rightmost/max TID that gets overwritten.)
     */
    nposting = CopyIndexTuple(oposting);
    replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
    replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
    nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
    memmove(replaceposright, replacepos, nmovebytes);

    /* Fill the gap at postingoff with TID of new item (original new TID) */
    Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
    ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);

    /* Now copy oposting's rightmost/max TID into new item (final new TID) */
    ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);

    Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
                              BTreeTupleGetHeapTID(newitem)) < 0);
    Assert(_bt_posting_valid(nposting));

    return nposting;
}

/*
 * Verify posting list invariants for "posting", which must be a posting list
 * tuple.  Used within assertions.
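 *
 * A valid posting list tuple contains at least two heap TIDs, each of which
 * must itself be a valid item pointer, stored in strictly ascending TID
 * order.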
 */
#ifdef USE_ASSERT_CHECKING
static bool
_bt_posting_valid(IndexTuple posting)
{
    ItemPointerData last;
    ItemPointer htid;

    if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
        return false;

    /* Remember first heap TID for loop */
    ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    if (!ItemPointerIsValid(&last))
        return false;

    /* Iterate, starting from second TID */
    for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    {
        htid = BTreeTupleGetPostingN(posting, i);

        if (!ItemPointerIsValid(htid))
            return false;
        if (ItemPointerCompare(htid, &last) <= 0)
            return false;
        ItemPointerCopy(htid, &last);
    }

    return true;
}
#endif