/*-------------------------------------------------------------------------
 *
 * nbtdedup.c
 *	  Deduplicate or bottom-up delete items in Postgres btrees.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtdedup.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include "access/nbtree.h"
|
|
#include "access/nbtxlog.h"
|
|
#include "access/xloginsert.h"
|
|
#include "miscadmin.h"
|
|
#include "utils/rel.h"
|
|
|
|
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
|
|
TM_IndexDeleteOp *delstate);
|
|
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
|
|
OffsetNumber minoff, IndexTuple newitem);
|
|
static void _bt_singleval_fillfactor(Page page, BTDedupState state,
|
|
Size newitemsz);
|
|
#ifdef USE_ASSERT_CHECKING
|
|
static bool _bt_posting_valid(IndexTuple posting);
|
|
#endif
|
|
|
|
/*
 * Perform a deduplication pass.
 *
 * The general approach taken here is to perform as much deduplication as
 * possible to free as much space as possible.  Note, however, that "single
 * value" strategy is used for !bottomupdedup callers when the page is full of
 * tuples of a single value.  Deduplication passes that apply the strategy
 * will leave behind a few untouched tuples at the end of the page, preparing
 * the page for an anticipated page split that uses nbtsplitloc.c's own single
 * value strategy.  Our high level goal is to delay merging the untouched
 * tuples until after the page splits.
 *
 * When a call to _bt_bottomupdel_pass() just took place (and failed), our
 * high level goal is to prevent a page split entirely by buying more time.
 * We still hope that a page split can be avoided altogether.  That's why
 * single value strategy is not even considered for bottomupdedup callers.
 *
 * The page will have to be split if we cannot successfully free at least
 * newitemsz (we also need space for newitem's line pointer, which isn't
 * included in caller's newitemsz).
 *
 * Note: Caller should have already deleted all existing items with their
 * LP_DEAD bits set.
 */
void
_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
			   bool bottomupdedup)
{
	OffsetNumber offnum,
				minoff,
				maxoff;
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	Page		newpage;
	BTDedupState state;
	Size		pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
	bool		singlevalstrat = false;
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);

	/*
	 * Initialize deduplication state.
	 *
	 * It would be possible for maxpostingsize (limit on posting list tuple
	 * size) to be set to one third of the page.  However, it seems like a
	 * good idea to limit the size of posting lists to one sixth of a page.
	 * That ought to leave us with a good split point when pages full of
	 * duplicates can be split several times.
	 */
	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
	state->deduplicate = true;
	state->nmaxitems = 0;
	state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
	/* Metadata about base tuple of current pending posting list */
	state->base = NULL;
	state->baseoff = InvalidOffsetNumber;
	state->basetupsize = 0;
	/* Metadata about current pending posting list TIDs */
	state->htids = palloc(state->maxpostingsize);
	state->nhtids = 0;
	state->nitems = 0;
	/* Size of all physical tuples to be replaced by pending posting list */
	state->phystupsize = 0;
	/* nintervals should be initialized to zero */
	state->nintervals = 0;

	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * Consider applying "single value" strategy, though only if the page
	 * seems likely to be split in the near future
	 */
	if (!bottomupdedup)
		singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);

	/*
	 * Deduplicate items from page, and write them to newpage.
	 *
	 * Copy the original page's LSN into newpage copy.  This will become the
	 * updated version of the page.  We need this because XLogInsert will
	 * examine the LSN and possibly dump it in a page image.
	 */
	newpage = PageGetTempPageCopySpecial(page);
	PageSetLSN(newpage, PageGetLSN(page));

	/* Copy high key, if any */
	if (!P_RIGHTMOST(opaque))
	{
		ItemId		hitemid = PageGetItemId(page, P_HIKEY);
		Size		hitemsz = ItemIdGetLength(hitemid);
		IndexTuple	hitem = (IndexTuple) PageGetItem(page, hitemid);

		if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "deduplication failed to add highkey");
	}

	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

		Assert(!ItemIdIsDead(itemid));

		if (offnum == minoff)
		{
			/*
			 * No previous/base tuple for the data item -- use the data item
			 * as base tuple of pending posting list
			 */
			_bt_dedup_start_pending(state, itup, offnum);
		}
		else if (state->deduplicate &&
				 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
				 _bt_dedup_save_htid(state, itup))
		{
			/*
			 * Tuple is equal to base tuple of pending posting list.  Heap
			 * TID(s) for itup have been saved in state.
			 */
		}
		else
		{
			/*
			 * Tuple is not equal to pending posting list tuple, or
			 * _bt_dedup_save_htid() opted to not merge current item into
			 * pending posting list for some other reason (e.g., adding more
			 * TIDs would have caused posting list to exceed current
			 * maxpostingsize).
			 *
			 * If state contains pending posting list with more than one item,
			 * form new posting tuple and add it to our temp page (newpage).
			 * Else add pending interval's base tuple to the temp page as-is.
			 */
			pagesaving += _bt_dedup_finish_pending(newpage, state);

			if (singlevalstrat)
			{
				/*
				 * Single value strategy's extra steps.
				 *
				 * Lower maxpostingsize for sixth and final large posting list
				 * tuple at the point where 5 maxpostingsize-capped tuples
				 * have either been formed or observed.
				 *
				 * When a sixth maxpostingsize-capped item is formed/observed,
				 * stop merging together tuples altogether.  The few tuples
				 * that remain at the end of the page won't be merged together
				 * at all (at least not until after a future page split takes
				 * place, when this page's newly allocated right sibling page
				 * gets its first deduplication pass).
				 */
				if (state->nmaxitems == 5)
					_bt_singleval_fillfactor(page, state, newitemsz);
				else if (state->nmaxitems == 6)
				{
					state->deduplicate = false;
					singlevalstrat = false; /* won't be back here */
				}
			}

			/* itup starts new pending posting list */
			_bt_dedup_start_pending(state, itup, offnum);
		}
	}

	/* Handle the last item */
	pagesaving += _bt_dedup_finish_pending(newpage, state);

	/*
	 * If no items suitable for deduplication were found, newpage must be
	 * exactly the same as the original page, so just return from function.
	 *
	 * We could determine whether or not to proceed on the basis the space
	 * savings being sufficient to avoid an immediate page split instead.  We
	 * don't do that because there is some small value in nbtsplitloc.c always
	 * operating against a page that is fully deduplicated (apart from
	 * newitem).  Besides, most of the cost has already been paid.
	 */
	if (state->nintervals == 0)
	{
		/* cannot leak memory here */
		pfree(newpage);
		pfree(state->htids);
		pfree(state);
		return;
	}

	/*
	 * By here, it's clear that deduplication will definitely go ahead.
	 *
	 * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
	 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
	 * But keep things tidy.
	 */
	if (P_HAS_GARBAGE(opaque))
	{
		BTPageOpaque nopaque = BTPageGetOpaque(newpage);

		nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
	}

	/*
	 * Critical section: install the rewritten page and (when required) emit
	 * the WAL record as one atomic action.
	 */
	START_CRIT_SECTION();

	PageRestoreTempPage(newpage, page);
	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;
		xl_btree_dedup xlrec_dedup;

		xlrec_dedup.nintervals = state->nintervals;

		XLogBeginInsert();
		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);

		/*
		 * The intervals array is not in the buffer, but pretend that it is.
		 * When XLogInsert stores the whole buffer, the array need not be
		 * stored too.
		 */
		XLogRegisterBufData(0, (char *) state->intervals,
							state->nintervals * sizeof(BTDedupInterval));

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);

		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();

	/* Local space accounting should agree with page accounting */
	Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);

	/* cannot leak memory here */
	pfree(state->htids);
	pfree(state);
}
|
|
|
|
/*
 * Perform bottom-up index deletion pass.
 *
 * See if duplicate index tuples (plus certain nearby tuples) are eligible to
 * be deleted via bottom-up index deletion.  The high level goal here is to
 * entirely prevent "unnecessary" page splits caused by MVCC version churn
 * from UPDATEs (when the UPDATEs don't logically modify any of the columns
 * covered by the 'rel' index).  This is qualitative, not quantitative -- we
 * do not particularly care about once-off opportunities to delete many index
 * tuples together.
 *
 * See nbtree/README for details on the design of nbtree bottom-up deletion.
 * See access/tableam.h for a description of how we're expected to cooperate
 * with the tableam.
 *
 * Returns true on success, in which case caller can assume page split will be
 * avoided for a reasonable amount of time.  Returns false when caller should
 * deduplicate the page (if possible at all).
 *
 * Note: Occasionally we return true despite failing to delete enough items to
 * avoid a split.  This makes caller skip deduplication and go split the page
 * right away.  Our return value is always just advisory information.
 *
 * Note: Caller should have already deleted all existing items with their
 * LP_DEAD bits set.
 */
bool
_bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
					 Size newitemsz)
{
	OffsetNumber offnum,
				minoff,
				maxoff;
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	BTDedupState state;
	TM_IndexDeleteOp delstate;
	bool		neverdedup;
	int			nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);

	/* Initialize deduplication state */
	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
	state->deduplicate = true;
	state->nmaxitems = 0;
	state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
	state->base = NULL;
	state->baseoff = InvalidOffsetNumber;
	state->basetupsize = 0;
	state->htids = palloc(state->maxpostingsize);
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;
	state->nintervals = 0;

	/*
	 * Initialize tableam state that describes bottom-up index deletion
	 * operation.
	 *
	 * We'll go on to ask the tableam to search for TIDs whose index tuples we
	 * can safely delete.  The tableam will search until our leaf page space
	 * target is satisfied, or until the cost of continuing with the tableam
	 * operation seems too high.  It focuses its efforts on TIDs associated
	 * with duplicate index tuples that we mark "promising".
	 *
	 * This space target is a little arbitrary.  The tableam must be able to
	 * keep the costs and benefits in balance.  We provide the tableam with
	 * exhaustive information about what might work, without directly
	 * concerning ourselves with avoiding work during the tableam call.  Our
	 * role in costing the bottom-up deletion process is strictly advisory.
	 */
	delstate.irel = rel;
	delstate.iblknum = BufferGetBlockNumber(buf);
	delstate.bottomup = true;
	delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
	delstate.ndeltids = 0;
	delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
	delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

		Assert(!ItemIdIsDead(itemid));

		if (offnum == minoff)
		{
			/* itup starts first pending interval */
			_bt_dedup_start_pending(state, itup, offnum);
		}
		else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
				 _bt_dedup_save_htid(state, itup))
		{
			/* Tuple is equal; just added its TIDs to pending interval */
		}
		else
		{
			/* Finalize interval -- move its TIDs to delete state */
			_bt_bottomupdel_finish_pending(page, state, &delstate);

			/* itup starts new pending interval */
			_bt_dedup_start_pending(state, itup, offnum);
		}
	}
	/* Finalize final interval -- move its TIDs to delete state */
	_bt_bottomupdel_finish_pending(page, state, &delstate);

	/*
	 * We don't give up now in the event of having few (or even zero)
	 * promising tuples for the tableam because it's not up to us as the index
	 * AM to manage costs (note that the tableam might have heuristics of its
	 * own that work out what to do).  We should at least avoid having our
	 * caller do a useless deduplication pass after we return in the event of
	 * zero promising tuples, though.
	 */
	neverdedup = false;
	if (state->nintervals == 0)
		neverdedup = true;

	pfree(state->htids);
	pfree(state);

	/* Ask tableam which TIDs are deletable, then physically delete them */
	_bt_delitems_delete_check(rel, buf, heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);

	/* Report "success" to caller unconditionally to avoid deduplication */
	if (neverdedup)
		return true;

	/* Don't dedup when we won't end up back here any time soon anyway */
	return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
}
|
|
|
|
/*
 * Create a new pending posting list tuple based on caller's base tuple.
 *
 * Every tuple processed by deduplication either becomes the base tuple for a
 * posting list, or gets its heap TID(s) accepted into a pending posting list.
 * A tuple that starts out as the base tuple for a posting list will only
 * actually be rewritten within _bt_dedup_finish_pending() when it turns out
 * that there are duplicates that can be merged into the base tuple.
 */
void
_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
						OffsetNumber baseoff)
{
	Assert(state->nhtids == 0);
	Assert(state->nitems == 0);
	Assert(!BTreeTupleIsPivot(base));

	/*
	 * Copy heap TID(s) from new base tuple for new candidate posting list
	 * into working state's array
	 */
	if (!BTreeTupleIsPosting(base))
	{
		/* Plain non-pivot tuple: single TID stored in tuple header */
		memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
		state->nhtids = 1;
		state->basetupsize = IndexTupleSize(base);
	}
	else
	{
		int			nposting;

		nposting = BTreeTupleGetNPosting(base);
		memcpy(state->htids, BTreeTupleGetPosting(base),
			   sizeof(ItemPointerData) * nposting);
		state->nhtids = nposting;
		/* basetupsize should not include existing posting list */
		state->basetupsize = BTreeTupleGetPostingOffset(base);
	}

	/*
	 * Save new base tuple itself -- it'll be needed if we actually create a
	 * new posting list from new pending posting list.
	 *
	 * Must maintain physical size of all existing tuples (including line
	 * pointer overhead) so that we can calculate space savings on page.
	 */
	state->nitems = 1;
	state->base = base;
	state->baseoff = baseoff;
	state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
	/* Also save baseoff in pending state for interval */
	state->intervals[state->nintervals].baseoff = state->baseoff;
}
|
|
|
|
/*
 * Save itup heap TID(s) into pending posting list where possible.
 *
 * Returns bool indicating if the pending posting list managed by state now
 * includes itup's heap TID(s).
 */
bool
_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
{
	int			nhtids;
	ItemPointer htids;
	Size		mergedtupsz;

	Assert(!BTreeTupleIsPivot(itup));

	/* Locate itup's TID(s): header TID for plain tuples, else posting list */
	if (!BTreeTupleIsPosting(itup))
	{
		nhtids = 1;
		htids = &itup->t_tid;
	}
	else
	{
		nhtids = BTreeTupleGetNPosting(itup);
		htids = BTreeTupleGetPosting(itup);
	}

	/*
	 * Don't append (have caller finish pending posting list as-is) if
	 * appending heap TID(s) from itup would put us over maxpostingsize limit.
	 *
	 * This calculation needs to match the code used within _bt_form_posting()
	 * for new posting list tuples.
	 */
	mergedtupsz = MAXALIGN(state->basetupsize +
						   (state->nhtids + nhtids) * sizeof(ItemPointerData));

	if (mergedtupsz > state->maxpostingsize)
	{
		/*
		 * Count this as an oversized item for single value strategy, though
		 * only when there are 50 TIDs in the final posting list tuple.  This
		 * limit (which is fairly arbitrary) avoids confusion about how many
		 * 1/6 of a page tuples have been encountered/created by the current
		 * deduplication pass.
		 *
		 * Note: We deliberately don't consider which deduplication pass
		 * merged together tuples to create this item (could be a previous
		 * deduplication pass, or current pass).  See _bt_do_singleval()
		 * comments.
		 */
		if (state->nhtids > 50)
			state->nmaxitems++;

		return false;
	}

	/*
	 * Save heap TIDs to pending posting list tuple -- itup can be merged into
	 * pending posting list
	 */
	state->nitems++;
	memcpy(state->htids + state->nhtids, htids,
		   sizeof(ItemPointerData) * nhtids);
	state->nhtids += nhtids;
	state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);

	return true;
}
|
|
|
|
/*
 * Finalize pending posting list tuple, and add it to the page.  Final tuple
 * is based on saved base tuple, and saved list of heap TIDs.
 *
 * Returns space saving from deduplicating to make a new posting list tuple.
 * Note that this includes line pointer overhead.  This is zero in the case
 * where no deduplication was possible.
 */
Size
_bt_dedup_finish_pending(Page newpage, BTDedupState state)
{
	OffsetNumber tupoff;
	Size		tuplesz;
	Size		spacesaving;

	Assert(state->nitems > 0);
	Assert(state->nitems <= state->nhtids);
	Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

	tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
	if (state->nitems == 1)
	{
		/* Use original, unchanged base tuple */
		tuplesz = IndexTupleSize(state->base);
		Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
		Assert(tuplesz <= BTMaxItemSize(newpage));
		if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "deduplication failed to add tuple to page");

		spacesaving = 0;
	}
	else
	{
		IndexTuple	final;

		/* Form a tuple with a posting list */
		final = _bt_form_posting(state->base, state->htids, state->nhtids);
		tuplesz = IndexTupleSize(final);
		Assert(tuplesz <= state->maxpostingsize);

		/* Save final number of items for posting list */
		state->intervals[state->nintervals].nitems = state->nitems;

		Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
		Assert(tuplesz <= BTMaxItemSize(newpage));
		if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
						false) == InvalidOffsetNumber)
			elog(ERROR, "deduplication failed to add tuple to page");

		pfree(final);
		spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
		/* Increment nintervals, since we wrote a new posting list tuple */
		state->nintervals++;
		Assert(spacesaving > 0 && spacesaving < BLCKSZ);
	}

	/* Reset state for next pending posting list */
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;

	return spacesaving;
}
|
|
|
|
/*
 * Finalize interval during bottom-up index deletion.
 *
 * During a bottom-up pass we expect that TIDs will be recorded in dedup state
 * first, and then get moved over to delstate (in variable-sized batches) by
 * calling here.  Call here happens when the number of TIDs in a dedup
 * interval is known, and interval gets finalized (i.e. when caller sees next
 * tuple on the page is not a duplicate, or when caller runs out of tuples to
 * process from leaf page).
 *
 * This is where bottom-up deletion determines and remembers which entries are
 * duplicates.  This will be important information to the tableam delete
 * infrastructure later on.  Plain index tuple duplicates are marked
 * "promising" here, per tableam contract.
 *
 * Our approach to marking entries whose TIDs come from posting lists is more
 * complicated.  Posting lists can only be formed by a deduplication pass (or
 * during an index build), so recent version churn affecting the pointed-to
 * logical rows is not particularly likely.  We may still give a weak signal
 * about posting list tuples' entries (by marking just one of its TIDs/entries
 * promising), though this is only a possibility in the event of further
 * duplicate index tuples in final interval that covers posting list tuple (as
 * in the plain tuple case).  A weak signal/hint will be useful to the tableam
 * when it has no stronger signal to go with for the deletion operation as a
 * whole.
 *
 * The heuristics we use work well in practice because we only need to give
 * the tableam the right _general_ idea about where to look.  Garbage tends to
 * naturally get concentrated in relatively few table blocks with workloads
 * that bottom-up deletion targets.  The tableam cannot possibly rank all
 * available table blocks sensibly based on the hints we provide, but that's
 * okay -- only the extremes matter.  The tableam just needs to be able to
 * predict which few table blocks will have the most tuples that are safe to
 * delete for each deletion operation, with low variance across related
 * deletion operations.
 */
static void
_bt_bottomupdel_finish_pending(Page page, BTDedupState state,
							   TM_IndexDeleteOp *delstate)
{
	bool		dupinterval = (state->nitems > 1);

	Assert(state->nitems > 0);
	Assert(state->nitems <= state->nhtids);
	Assert(state->intervals[state->nintervals].baseoff == state->baseoff);

	for (int i = 0; i < state->nitems; i++)
	{
		OffsetNumber offnum = state->baseoff + i;
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
		TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];

		if (!BTreeTupleIsPosting(itup))
		{
			/* Simple case: A plain non-pivot tuple */
			ideltid->tid = itup->t_tid;
			ideltid->id = delstate->ndeltids;
			istatus->idxoffnum = offnum;
			istatus->knowndeletable = false;	/* for now */
			istatus->promising = dupinterval;	/* simple rule */
			istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);

			delstate->ndeltids++;
		}
		else
		{
			/*
			 * Complicated case: A posting list tuple.
			 *
			 * We make the conservative assumption that there can only be at
			 * most one affected logical row per posting list tuple.  There
			 * will be at most one promising entry in deltids to represent
			 * this presumed lone logical row.  Note that this isn't even
			 * considered unless the posting list tuple is also in an interval
			 * of duplicates -- this complicated rule is just a variant of the
			 * simple rule used to decide if plain index tuples are promising.
			 */
			int			nitem = BTreeTupleGetNPosting(itup);
			bool		firstpromising = false;
			bool		lastpromising = false;

			Assert(_bt_posting_valid(itup));

			if (dupinterval)
			{
				/*
				 * Complicated rule: either the first or last TID in the
				 * posting list gets marked promising (if any at all)
				 */
				BlockNumber minblocklist,
							midblocklist,
							maxblocklist;
				ItemPointer mintid,
							midtid,
							maxtid;

				mintid = BTreeTupleGetHeapTID(itup);
				midtid = BTreeTupleGetPostingN(itup, nitem / 2);
				maxtid = BTreeTupleGetMaxHeapTID(itup);
				minblocklist = ItemPointerGetBlockNumber(mintid);
				midblocklist = ItemPointerGetBlockNumber(midtid);
				maxblocklist = ItemPointerGetBlockNumber(maxtid);

				/* Only entry with predominant table block can be promising */
				firstpromising = (minblocklist == midblocklist);
				lastpromising = (!firstpromising &&
								 midblocklist == maxblocklist);
			}

			for (int p = 0; p < nitem; p++)
			{
				ItemPointer htid = BTreeTupleGetPostingN(itup, p);

				ideltid->tid = *htid;
				ideltid->id = delstate->ndeltids;
				istatus->idxoffnum = offnum;
				istatus->knowndeletable = false;	/* for now */
				istatus->promising = false;
				if ((firstpromising && p == 0) ||
					(lastpromising && p == nitem - 1))
					istatus->promising = true;
				istatus->freespace = sizeof(ItemPointerData);	/* at worst */

				ideltid++;
				istatus++;
				delstate->ndeltids++;
			}
		}
	}

	if (dupinterval)
	{
		state->intervals[state->nintervals].nitems = state->nitems;
		state->nintervals++;
	}

	/* Reset state for next interval */
	state->nhtids = 0;
	state->nitems = 0;
	state->phystupsize = 0;
}
|
|
|
|
/*
|
|
* Determine if page non-pivot tuples (data items) are all duplicates of the
|
|
* same value -- if they are, deduplication's "single value" strategy should
|
|
* be applied. The general goal of this strategy is to ensure that
|
|
* nbtsplitloc.c (which uses its own single value strategy) will find a useful
|
|
* split point as further duplicates are inserted, and successive rightmost
|
|
* page splits occur among pages that store the same duplicate value. When
|
|
* the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
|
|
* just like it would if deduplication were disabled.
|
|
*
|
|
* We expect that affected workloads will require _several_ single value
|
|
* strategy deduplication passes (over a page that only stores duplicates)
|
|
* before the page is finally split. The first deduplication pass should only
|
|
* find regular non-pivot tuples. Later deduplication passes will find
|
|
* existing maxpostingsize-capped posting list tuples, which must be skipped
|
|
* over. The penultimate pass is generally the first pass that actually
|
|
* reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
|
|
* few untouched non-pivot tuples. The final deduplication pass won't free
|
|
* any space -- it will skip over everything without merging anything (it
|
|
* retraces the steps of the penultimate pass).
|
|
*
|
|
* Fortunately, having several passes isn't too expensive. Each pass (after
|
|
* the first pass) won't spend many cycles on the large posting list tuples
|
|
* left by previous passes. Each pass will find a large contiguous group of
|
|
* smaller duplicate tuples to merge together at the end of the page.
|
|
*/
|
|
static bool
|
|
_bt_do_singleval(Relation rel, Page page, BTDedupState state,
|
|
OffsetNumber minoff, IndexTuple newitem)
|
|
{
|
|
int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
|
|
ItemId itemid;
|
|
IndexTuple itup;
|
|
|
|
itemid = PageGetItemId(page, minoff);
|
|
itup = (IndexTuple) PageGetItem(page, itemid);
|
|
|
|
if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
|
|
{
|
|
itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
|
|
itup = (IndexTuple) PageGetItem(page, itemid);
|
|
|
|
if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Lower maxpostingsize when using "single value" strategy, to avoid a sixth
|
|
* and final maxpostingsize-capped tuple. The sixth and final posting list
|
|
* tuple will end up somewhat smaller than the first five. (Note: The first
|
|
* five tuples could actually just be very large duplicate tuples that
|
|
* couldn't be merged together at all. Deduplication will simply not modify
|
|
* the page when that happens.)
|
|
*
|
|
* When there are six posting lists on the page (after current deduplication
|
|
* pass goes on to create/observe a sixth very large tuple), caller should end
|
|
* its deduplication pass. It isn't useful to try to deduplicate items that
|
|
* are supposed to end up on the new right sibling page following the
|
|
* anticipated page split. A future deduplication pass of future right
|
|
* sibling page might take care of it. (This is why the first single value
|
|
* strategy deduplication pass for a given leaf page will generally find only
|
|
* plain non-pivot tuples -- see _bt_do_singleval() comments.)
|
|
*/
|
|
static void
|
|
_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
|
|
{
|
|
Size leftfree;
|
|
int reduction;
|
|
|
|
/* This calculation needs to match nbtsplitloc.c */
|
|
leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
|
|
MAXALIGN(sizeof(BTPageOpaqueData));
|
|
/* Subtract size of new high key (includes pivot heap TID space) */
|
|
leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
|
|
|
|
/*
|
|
* Reduce maxpostingsize by an amount equal to target free space on left
|
|
* half of page
|
|
*/
|
|
reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
|
|
if (state->maxpostingsize > reduction)
|
|
state->maxpostingsize -= reduction;
|
|
else
|
|
state->maxpostingsize = 0;
|
|
}
|
|
|
|
/*
|
|
* Build a posting list tuple based on caller's "base" index tuple and list of
|
|
* heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
|
|
* posting list. (Posting list tuples can never have a single heap TID, partly
|
|
* because that ensures that deduplication always reduces final MAXALIGN()'d
|
|
* size of entire tuple.)
|
|
*
|
|
* Convention is that posting list starts at a MAXALIGN()'d offset (rather
|
|
* than a SHORTALIGN()'d offset), in line with the approach taken when
|
|
* appending a heap TID to new pivot tuple/high key during suffix truncation.
|
|
* This sometimes wastes a little space that was only needed as alignment
|
|
* padding in the original tuple. Following this convention simplifies the
|
|
* space accounting used when deduplicating a page (the same convention
|
|
* simplifies the accounting for choosing a point to split a page at).
|
|
*
|
|
* Note: Caller's "htids" array must be unique and already in ascending TID
|
|
* order. Any existing heap TIDs from "base" won't automatically appear in
|
|
* returned posting list tuple (they must be included in htids array.)
|
|
*/
|
|
IndexTuple
|
|
_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
|
|
{
|
|
uint32 keysize,
|
|
newsize;
|
|
IndexTuple itup;
|
|
|
|
if (BTreeTupleIsPosting(base))
|
|
keysize = BTreeTupleGetPostingOffset(base);
|
|
else
|
|
keysize = IndexTupleSize(base);
|
|
|
|
Assert(!BTreeTupleIsPivot(base));
|
|
Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
|
|
Assert(keysize == MAXALIGN(keysize));
|
|
|
|
/* Determine final size of new tuple */
|
|
if (nhtids > 1)
|
|
newsize = MAXALIGN(keysize +
|
|
nhtids * sizeof(ItemPointerData));
|
|
else
|
|
newsize = keysize;
|
|
|
|
Assert(newsize <= INDEX_SIZE_MASK);
|
|
Assert(newsize == MAXALIGN(newsize));
|
|
|
|
/* Allocate memory using palloc0() (matches index_form_tuple()) */
|
|
itup = palloc0(newsize);
|
|
memcpy(itup, base, keysize);
|
|
itup->t_info &= ~INDEX_SIZE_MASK;
|
|
itup->t_info |= newsize;
|
|
if (nhtids > 1)
|
|
{
|
|
/* Form posting list tuple */
|
|
BTreeTupleSetPosting(itup, nhtids, keysize);
|
|
memcpy(BTreeTupleGetPosting(itup), htids,
|
|
sizeof(ItemPointerData) * nhtids);
|
|
Assert(_bt_posting_valid(itup));
|
|
}
|
|
else
|
|
{
|
|
/* Form standard non-pivot tuple */
|
|
itup->t_info &= ~INDEX_ALT_TID_MASK;
|
|
ItemPointerCopy(htids, &itup->t_tid);
|
|
Assert(ItemPointerIsValid(&itup->t_tid));
|
|
}
|
|
|
|
return itup;
|
|
}
|
|
|
|
/*
|
|
* Generate a replacement tuple by "updating" a posting list tuple so that it
|
|
* no longer has TIDs that need to be deleted.
|
|
*
|
|
* Used by both VACUUM and index deletion. Caller's vacposting argument
|
|
* points to the existing posting list tuple to be updated.
|
|
*
|
|
* On return, caller's vacposting argument will point to final "updated"
|
|
* tuple, which will be palloc()'d in caller's memory context.
|
|
*/
|
|
void
|
|
_bt_update_posting(BTVacuumPosting vacposting)
|
|
{
|
|
IndexTuple origtuple = vacposting->itup;
|
|
uint32 keysize,
|
|
newsize;
|
|
IndexTuple itup;
|
|
int nhtids;
|
|
int ui,
|
|
d;
|
|
ItemPointer htids;
|
|
|
|
nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
|
|
|
|
Assert(_bt_posting_valid(origtuple));
|
|
Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
|
|
|
|
/*
|
|
* Determine final size of new tuple.
|
|
*
|
|
* This calculation needs to match the code used within _bt_form_posting()
|
|
* for new posting list tuples. We avoid calling _bt_form_posting() here
|
|
* to save ourselves a second memory allocation for a htids workspace.
|
|
*/
|
|
keysize = BTreeTupleGetPostingOffset(origtuple);
|
|
if (nhtids > 1)
|
|
newsize = MAXALIGN(keysize +
|
|
nhtids * sizeof(ItemPointerData));
|
|
else
|
|
newsize = keysize;
|
|
|
|
Assert(newsize <= INDEX_SIZE_MASK);
|
|
Assert(newsize == MAXALIGN(newsize));
|
|
|
|
/* Allocate memory using palloc0() (matches index_form_tuple()) */
|
|
itup = palloc0(newsize);
|
|
memcpy(itup, origtuple, keysize);
|
|
itup->t_info &= ~INDEX_SIZE_MASK;
|
|
itup->t_info |= newsize;
|
|
|
|
if (nhtids > 1)
|
|
{
|
|
/* Form posting list tuple */
|
|
BTreeTupleSetPosting(itup, nhtids, keysize);
|
|
htids = BTreeTupleGetPosting(itup);
|
|
}
|
|
else
|
|
{
|
|
/* Form standard non-pivot tuple */
|
|
itup->t_info &= ~INDEX_ALT_TID_MASK;
|
|
htids = &itup->t_tid;
|
|
}
|
|
|
|
ui = 0;
|
|
d = 0;
|
|
for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
|
|
{
|
|
if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
|
|
{
|
|
d++;
|
|
continue;
|
|
}
|
|
htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
|
|
}
|
|
Assert(ui == nhtids);
|
|
Assert(d == vacposting->ndeletedtids);
|
|
Assert(nhtids == 1 || _bt_posting_valid(itup));
|
|
Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
|
|
|
|
/* vacposting arg's itup will now point to updated version */
|
|
vacposting->itup = itup;
|
|
}
|
|
|
|
/*
|
|
* Prepare for a posting list split by swapping heap TID in newitem with heap
|
|
* TID from original posting list (the 'oposting' heap TID located at offset
|
|
* 'postingoff'). Modifies newitem, so caller should pass their own private
|
|
* copy that can safely be modified.
|
|
*
|
|
* Returns new posting list tuple, which is palloc()'d in caller's context.
|
|
* This is guaranteed to be the same size as 'oposting'. Modified newitem is
|
|
* what caller actually inserts. (This happens inside the same critical
|
|
* section that performs an in-place update of old posting list using new
|
|
* posting list returned here.)
|
|
*
|
|
* While the keys from newitem and oposting must be opclass equal, and must
|
|
* generate identical output when run through the underlying type's output
|
|
* function, it doesn't follow that their representations match exactly.
|
|
* Caller must avoid assuming that there can't be representational differences
|
|
* that make datums from oposting bigger or smaller than the corresponding
|
|
* datums from newitem. For example, differences in TOAST input state might
|
|
* break a faulty assumption about tuple size (the executor is entitled to
|
|
* apply TOAST compression based on its own criteria). It also seems possible
|
|
* that further representational variation will be introduced in the future,
|
|
* in order to support nbtree features like page-level prefix compression.
|
|
*
|
|
* See nbtree/README for details on the design of posting list splits.
|
|
*/
|
|
IndexTuple
|
|
_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
|
|
{
|
|
int nhtids;
|
|
char *replacepos;
|
|
char *replaceposright;
|
|
Size nmovebytes;
|
|
IndexTuple nposting;
|
|
|
|
nhtids = BTreeTupleGetNPosting(oposting);
|
|
Assert(_bt_posting_valid(oposting));
|
|
|
|
/*
|
|
* The postingoff argument originated as a _bt_binsrch_posting() return
|
|
* value. It will be 0 in the event of corruption that makes a leaf page
|
|
* contain a non-pivot tuple that's somehow identical to newitem (no two
|
|
* non-pivot tuples should ever have the same TID). This has been known
|
|
* to happen in the field from time to time.
|
|
*
|
|
* Perform a basic sanity check to catch this case now.
|
|
*/
|
|
if (!(postingoff > 0 && postingoff < nhtids))
|
|
elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
|
|
nhtids, postingoff);
|
|
|
|
/*
|
|
* Move item pointers in posting list to make a gap for the new item's
|
|
* heap TID. We shift TIDs one place to the right, losing original
|
|
* rightmost TID. (nmovebytes must not include TIDs to the left of
|
|
* postingoff, nor the existing rightmost/max TID that gets overwritten.)
|
|
*/
|
|
nposting = CopyIndexTuple(oposting);
|
|
replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
|
|
replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
|
|
nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
|
|
memmove(replaceposright, replacepos, nmovebytes);
|
|
|
|
/* Fill the gap at postingoff with TID of new item (original new TID) */
|
|
Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
|
|
ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
|
|
|
|
/* Now copy oposting's rightmost/max TID into new item (final new TID) */
|
|
ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
|
|
|
|
Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
|
|
BTreeTupleGetHeapTID(newitem)) < 0);
|
|
Assert(_bt_posting_valid(nposting));
|
|
|
|
return nposting;
|
|
}
|
|
|
|
/*
|
|
* Verify posting list invariants for "posting", which must be a posting list
|
|
* tuple. Used within assertions.
|
|
*/
|
|
#ifdef USE_ASSERT_CHECKING
|
|
static bool
|
|
_bt_posting_valid(IndexTuple posting)
|
|
{
|
|
ItemPointerData last;
|
|
ItemPointer htid;
|
|
|
|
if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
|
|
return false;
|
|
|
|
/* Remember first heap TID for loop */
|
|
ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
|
|
if (!ItemPointerIsValid(&last))
|
|
return false;
|
|
|
|
/* Iterate, starting from second TID */
|
|
for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
|
|
{
|
|
htid = BTreeTupleGetPostingN(posting, i);
|
|
|
|
if (!ItemPointerIsValid(htid))
|
|
return false;
|
|
if (ItemPointerCompare(htid, &last) <= 0)
|
|
return false;
|
|
ItemPointerCopy(htid, &last);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
#endif
|