diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 6a058ccdac..31717321b0 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid,
bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
IndexTuple itup);
+static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
@@ -167,6 +168,7 @@ static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block,
Page page, OffsetNumber offset);
static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state,
IndexTuple itup, bool nonpivot);
+static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup);
/*
* bt_index_check(index regclass, heapallindexed boolean)
@@ -278,7 +280,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
if (btree_index_mainfork_expected(indrel))
{
- bool heapkeyspace;
+ bool heapkeyspace,
+ allequalimage;
RelationOpenSmgr(indrel);
if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM))
@@ -288,7 +291,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
RelationGetRelationName(indrel))));
/* Check index, possibly against table it is an index on */
- heapkeyspace = _bt_heapkeyspace(indrel);
+ _bt_metaversion(indrel, &heapkeyspace, &allequalimage);
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
heapallindexed, rootdescend);
}
@@ -419,12 +422,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
/*
* Size Bloom filter based on estimated number of tuples in index,
* while conservatively assuming that each block must contain at least
- * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot
- * contain non-pivot tuples. That's okay because they generally make
- * up no more than about 1% of all pages in the index.)
+ * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
+ * bt_posting_plain_tuple() for definition, and details of how posting
+ * list tuples are handled.
*/
total_pages = RelationGetNumberOfBlocks(rel);
- total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5),
+ total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
(int64) state->rel->rd_rel->reltuples);
/* Random seed relies on backend srandom() call to avoid repetition */
seed = random();
@@ -924,6 +927,7 @@ bt_target_page_check(BtreeCheckState *state)
size_t tupsize;
BTScanInsert skey;
bool lowersizelimit;
+ ItemPointer scantid;
CHECK_FOR_INTERRUPTS();
@@ -954,13 +958,15 @@ bt_target_page_check(BtreeCheckState *state)
if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
offset))
{
+ ItemPointer tid;
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -994,18 +1000,20 @@ bt_target_page_check(BtreeCheckState *state)
/*
* Readonly callers may optionally verify that non-pivot tuples can
- * each be found by an independent search that starts from the root
+ * each be found by an independent search that starts from the root.
+ * Note that we deliberately don't do individual searches for each
+ * TID, since the posting list itself is validated by other checks.
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
- htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumber(&(itup->t_tid)),
- ItemPointerGetOffsetNumber(&(itup->t_tid)));
+ htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1017,6 +1025,40 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) state->targetlsn)));
}
+ /*
+ * If tuple is a posting list tuple, make sure posting list TIDs are
+ * in order
+ */
+ if (BTreeTupleIsPosting(itup))
+ {
+ ItemPointerData last;
+ ItemPointer current;
+
+ ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+
+ current = BTreeTupleGetPostingN(itup, i);
+
+ if (ItemPointerCompare(current, &last) <= 0)
+ {
+ char *itid = psprintf("(%u,%u)", state->targetblock, offset);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("posting list contains misplaced TID in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.",
+ itid, i,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ }
+
+ ItemPointerCopy(current, &last);
+ }
+ }
+
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
@@ -1049,13 +1091,14 @@ bt_target_page_check(BtreeCheckState *state)
if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) :
BTMaxItemSizeNoHeapTid(state->target)))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1074,12 +1117,32 @@ bt_target_page_check(BtreeCheckState *state)
{
IndexTuple norm;
- norm = bt_normalize_tuple(state, itup);
- bloom_add_element(state->filter, (unsigned char *) norm,
- IndexTupleSize(norm));
- /* Be tidy */
- if (norm != itup)
- pfree(norm);
+ if (BTreeTupleIsPosting(itup))
+ {
+ /* Fingerprint all elements as distinct "plain" tuples */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ IndexTuple logtuple;
+
+ logtuple = bt_posting_plain_tuple(itup, i);
+ norm = bt_normalize_tuple(state, logtuple);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != logtuple)
+ pfree(norm);
+ pfree(logtuple);
+ }
+ }
+ else
+ {
+ norm = bt_normalize_tuple(state, itup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != itup)
+ pfree(norm);
+ }
}
/*
@@ -1087,7 +1150,8 @@ bt_target_page_check(BtreeCheckState *state)
*
* If there is a high key (if this is not the rightmost page on its
* entire level), check that high key actually is upper bound on all
- * page items.
+ * page items. If this is a posting list tuple, we'll need to set
+ * scantid to be highest TID in posting list.
*
* We prefer to check all items against high key rather than checking
* just the last and trusting that the operator class obeys the
@@ -1127,17 +1191,22 @@ bt_target_page_check(BtreeCheckState *state)
* tuple. (See also: "Notes About Data Representation" in the nbtree
* README.)
*/
+ scantid = skey->scantid;
+ if (state->heapkeyspace && BTreeTupleIsPosting(itup))
+ skey->scantid = BTreeTupleGetMaxHeapTID(itup);
+
if (!P_RIGHTMOST(topaque) &&
!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
invariant_l_offset(state, skey, P_HIKEY)))
{
+ ItemPointer tid = BTreeTupleGetPointsToTID(itup);
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1150,6 +1219,8 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ /* Reset, in case scantid was set to (itup) posting tuple's max TID */
+ skey->scantid = scantid;
/*
* * Item order check *
@@ -1160,15 +1231,17 @@ bt_target_page_check(BtreeCheckState *state)
if (OffsetNumberNext(offset) <= max &&
!invariant_l_offset(state, skey, OffsetNumberNext(offset)))
{
+ ItemPointer tid;
char *itid,
*htid,
*nitid,
*nhtid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetPointsToTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
nitid = psprintf("(%u,%u)", state->targetblock,
OffsetNumberNext(offset));
@@ -1177,9 +1250,10 @@ bt_target_page_check(BtreeCheckState *state)
state->target,
OffsetNumberNext(offset));
itup = (IndexTuple) PageGetItem(state->target, itemid);
+ tid = BTreeTupleGetPointsToTID(itup);
nhtid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1953,10 +2027,9 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values,
* verification. In particular, it won't try to normalize opclass-equal
* datums with potentially distinct representations (e.g., btree/numeric_ops
* index datums will not get their display scale normalized-away here).
- * Normalization may need to be expanded to handle more cases in the future,
- * though. For example, it's possible that non-pivot tuples could in the
- * future have alternative logically equivalent representations due to using
- * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
+ * Caller does normalization for non-pivot tuples that have a posting list,
+ * since dummy CREATE INDEX callback code generates new tuples with the same
+ * normalized representation.
*/
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
@@ -1969,6 +2042,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
IndexTuple reformed;
int i;
+ /* Caller should only pass "logical" non-pivot tuples here */
+ Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup));
+
/* Easy case: It's immediately clear that tuple has no varlena datums */
if (!IndexTupleHasVarwidths(itup))
return itup;
@@ -2031,6 +2107,29 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
return reformed;
}
+/*
+ * Produce palloc()'d "plain" tuple for nth posting list entry/TID.
+ *
+ * In general, deduplication is not supposed to change the logical contents of
+ * an index. Multiple index tuples are merged together into one equivalent
+ * posting list index tuple when convenient.
+ *
+ * heapallindexed verification must normalize-away this variation in
+ * representation by converting posting list tuples into two or more "plain"
+ * tuples. Each tuple must be fingerprinted separately -- there must be one
+ * tuple for each corresponding Bloom filter probe during the heap scan.
+ *
+ * Note: Caller still needs to call bt_normalize_tuple() with returned tuple.
+ */
+static inline IndexTuple
+bt_posting_plain_tuple(IndexTuple itup, int n)
+{
+ Assert(BTreeTupleIsPosting(itup));
+
+ /* Returns non-posting-list tuple */
+ return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
+}
+
/*
* Search for itup in index, starting from fast root page. itup must be a
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
@@ -2087,6 +2186,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = key;
+ insertstate.postingoff = 0;
insertstate.bounds_valid = false;
insertstate.buf = lbuf;
@@ -2094,7 +2194,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
offnum = _bt_binsrch_insert(state->rel, &insertstate);
/* Compare first >= matching item on leaf page, if any */
page = BufferGetPage(lbuf);
+ /* Should match on first heap TID when tuple has a posting list */
if (offnum <= PageGetMaxOffsetNumber(page) &&
+ insertstate.postingoff <= 0 &&
_bt_compare(state->rel, key, page, offnum) == 0)
exists = true;
_bt_relbuf(state->rel, lbuf);
@@ -2548,26 +2650,69 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page,
}
/*
- * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must
- * be present in cases where that is mandatory.
- *
- * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK
- * bit is effectively a proxy for whether or not the tuple is a pivot tuple.
- * It may become more useful in the future, when non-pivot tuples support their
- * own alternative INDEX_ALT_TID_MASK representation.
+ * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in
+ * cases where that is mandatory (i.e. for non-pivot tuples)
*/
static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
bool nonpivot)
{
- ItemPointer result = BTreeTupleGetHeapTID(itup);
- BlockNumber targetblock = state->targetblock;
+ ItemPointer htid;
- if (result == NULL && nonpivot)
+ /*
+ * Caller determines whether this is supposed to be a pivot or non-pivot
+ * tuple using page type and item offset number. Verify that tuple
+ * metadata agrees with this.
+ */
+ Assert(state->heapkeyspace);
+ if (BTreeTupleIsPivot(itup) && nonpivot)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
+
+ if (!BTreeTupleIsPivot(itup) && !nonpivot)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
+
+ htid = BTreeTupleGetHeapTID(itup);
+ if (!ItemPointerIsValid(htid) && nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
- targetblock, RelationGetRelationName(state->rel))));
+ state->targetblock,
+ RelationGetRelationName(state->rel))));
- return result;
+ return htid;
+}
+
+/*
+ * Return the "pointed to" TID for itup, which is used to generate a
+ * descriptive error message. itup must be a "data item" tuple (it wouldn't
+ * make much sense to call here with a high key tuple, since there won't be a
+ * valid downlink/block number to display).
+ *
+ * Returns either a heap TID (which will be the first heap TID in posting list
+ * if itup is posting list tuple), or a TID that contains downlink block
+ * number, plus some encoded metadata (e.g., the number of attributes present
+ * in itup).
+ */
+static inline ItemPointer
+BTreeTupleGetPointsToTID(IndexTuple itup)
+{
+ /*
+ * Rely on the assumption that !heapkeyspace internal page data items will
+ * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't
+ * recognize it as a pivot tuple, but everything still works out because
+ * the t_tid field is still returned
+ */
+ if (!BTreeTupleIsPivot(itup))
+ return BTreeTupleGetHeapTID(itup);
+
+ /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */
+ return &itup->t_tid;
}
diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml
index fcf771c857..f02e02b0ac 100644
--- a/doc/src/sgml/btree.sgml
+++ b/doc/src/sgml/btree.sgml
@@ -557,11 +557,208 @@ equalimage(opcintype oid) returns bool
Implementation
+
+ This section covers B-Tree index implementation details that may be
+ of use to advanced users. See
+ src/backend/access/nbtree/README in the source
+ distribution for a much more detailed, internals-focused description
+ of the B-Tree implementation.
+
+
+ B-Tree Structure
- An introduction to the btree index implementation can be found in
- src/backend/access/nbtree/README.
+ PostgreSQL B-Tree indexes are
+ multi-level tree structures, where each level of the tree can be
+ used as a doubly-linked list of pages. A single metapage is stored
+ in a fixed position at the start of the first segment file of the
+ index. All other pages are either leaf pages or internal pages.
+ Leaf pages are the pages on the lowest level of the tree. All
+ other levels consist of internal pages. Each leaf page contains
+ tuples that point to table rows. Each internal page contains
+ tuples that point to the next level down in the tree. Typically,
+ over 99% of all pages are leaf pages. Both internal pages and leaf
+ pages use the standard page format described in
+ <xref linkend="storage-page-layout"/>.
+
+
+ New leaf pages are added to a B-Tree index when an existing leaf
+ page cannot fit an incoming tuple. A page
+ split operation makes room for items that originally
+ belonged on the overflowing page by moving a portion of the items
+ to a new page. Page splits must also insert a new
+ downlink to the new page in the parent page,
+ which may cause the parent to split in turn. Page splits
+ cascade upwards in a recursive fashion. When the
+ root page finally cannot fit a new downlink, a root page
+ split operation takes place. This adds a new level to
+ the tree structure by creating a new root page that is one level
+ above the original root page.
+
+
+
+
+ Deduplication
+
+ A duplicate is a leaf page tuple (a tuple that points to a table
+ row) where all indexed key columns have values
+ that match corresponding column values from at least one other leaf
+ page tuple that's close by in the same index. Duplicate tuples are
+ quite common in practice. B-Tree indexes can use a special,
+ space-efficient representation for duplicates when an optional
+ technique is enabled: deduplication.
+
+
+ Deduplication works by periodically merging groups of duplicate
+ tuples together, forming a single posting list tuple for each
+ group. The column key value(s) only appear once in this
+ representation. This is followed by a sorted array of
+ TIDs that point to rows in the table. This
+ significantly reduces the storage size of indexes where each value
+ (or each distinct combination of column values) appears several
+ times on average. The latency of queries can be reduced
+ significantly. Overall query throughput may increase
+ significantly. The overhead of routine index vacuuming may also be
+ reduced significantly.
+
+
+
+ While NULL is generally not considered to be equal to any other
+ value, including NULL, NULL is nevertheless treated as just
+ another value from the domain of indexed values by the B-Tree
+ implementation (except when enforcing uniqueness in a unique
+ index). B-Tree deduplication is therefore just as effective with
+ duplicates that contain a NULL value.
+
+
+
+ The deduplication process occurs lazily, when a new item is
+ inserted that cannot fit on an existing leaf page. This prevents
+ (or at least delays) leaf page splits. Unlike GIN posting list
+ tuples, B-Tree posting list tuples do not need to expand every time
+ a new duplicate is inserted; they are merely an alternative
+ physical representation of the original logical contents of the
+ leaf page. This design prioritizes consistent performance with
+ mixed read-write workloads. Most client applications will at least
+ see a moderate performance benefit from using deduplication.
+ Deduplication is enabled by default.
+
+
+ Write-heavy workloads that don't benefit from deduplication due to
+ having few or no duplicate values in indexes will incur a small,
+ fixed performance penalty (unless deduplication is explicitly
+ disabled). The deduplicate_items storage
+ parameter can be used to disable deduplication within individual
+ indexes. There is never any performance penalty with read-only
+ workloads, since reading posting list tuples is at least as
+ efficient as reading the standard tuple representation. Disabling
+ deduplication isn't usually helpful.
+
+
+ B-Tree indexes are not directly aware that under MVCC, there might
+ be multiple extant versions of the same logical table row; to an
+ index, each tuple is an independent object that needs its own index
+ entry. Thus, an update of a row always creates all-new index
+ entries for the row, even if the key values did not change. Some
+ workloads suffer from index bloat caused by these
+ implementation-level version duplicates (this is typically a
+ problem for UPDATE-heavy workloads that cannot
+ apply the HOT optimization due to modifying at
+ least one indexed column). B-Tree deduplication does not
+ distinguish between these implementation-level version duplicates
+ and conventional duplicates. Deduplication can nevertheless help
+ with controlling index bloat caused by implementation-level version
+ churn.
+
+
+
+ A special heuristic is applied to determine whether a
+ deduplication pass in a unique index should take place. It can
+ often skip straight to splitting a leaf page, avoiding a
+ performance penalty from wasting cycles on unhelpful deduplication
+ passes. If you're concerned about the overhead of deduplication,
+ consider setting deduplicate_items = off
+ selectively. Leaving deduplication enabled in unique indexes has
+ little downside.
+
+
+
+ Deduplication cannot be used in all cases due to
+ implementation-level restrictions. Deduplication safety is
+ determined when CREATE INDEX or
+ REINDEX run.
+
+
+ Note that deduplication is deemed unsafe and cannot be used in the
+ following cases involving semantically significant differences
+ among equal datums:
+
+
+
+
+
+ text, varchar, and char
+ cannot use deduplication when a
+ nondeterministic collation is used. Case
+ and accent differences must be preserved among equal datums.
+
+
+
+
+
+ numeric cannot use deduplication. Numeric display
+ scale must be preserved among equal datums.
+
+
+
+
+
+ jsonb cannot use deduplication, since the
+ jsonb B-Tree operator class uses
+ numeric internally.
+
+
+
+
+
+ float4 and float8 cannot use
+ deduplication. These types have distinct representations for
+ -0 and 0, which are
+ nevertheless considered equal. This difference must be
+ preserved.
+
+
+
+
+
+ There is one further implementation-level restriction that may be
+ lifted in a future version of
+ PostgreSQL:
+
+
+
+
+
+ Container types (such as composite types, arrays, or range
+ types) cannot use deduplication.
+
+
+
+
+
+ There is one further implementation-level restriction that applies
+ regardless of the operator class or collation used:
+
+
+
+
+
+ INCLUDE indexes can never use deduplication.
+
+
+
+
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 057a6bb81a..20cdfabd7b 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
nondeterministic collations give a more correct behavior,
especially when considering the full power of Unicode and its many
special cases, they also have some drawbacks. Foremost, their use leads
- to a performance penalty. Also, certain operations are not possible with
- nondeterministic collations, such as pattern matching operations.
- Therefore, they should be used only in cases where they are specifically
- wanted.
+ to a performance penalty. Note, in particular, that B-tree cannot use
+ deduplication with indexes that use a nondeterministic collation. Also,
+ certain operations are not possible with nondeterministic collations,
+ such as pattern matching operations. Therefore, they should be used
+ only in cases where they are specifically wanted.
diff --git a/doc/src/sgml/citext.sgml b/doc/src/sgml/citext.sgml
index 667824fb0b..5986601327 100644
--- a/doc/src/sgml/citext.sgml
+++ b/doc/src/sgml/citext.sgml
@@ -233,9 +233,10 @@ SELECT * FROM users WHERE nick = 'Larry';
citext is not as efficient as text because the
operator functions and the B-tree comparison functions must make copies
- of the data and convert it to lower case for comparisons. It is,
- however, slightly more efficient than using lower to get
- case-insensitive matching.
+ of the data and convert it to lower case for comparisons. Also, only
+ text can support B-Tree deduplication. However,
+ citext is slightly more efficient than using
+ lower to get case-insensitive matching.
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ceda48e0fc..28035f1635 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -16561,10 +16561,11 @@ AND
rows. Two rows might have a different binary representation even
though comparisons of the two rows with the equality operator is true.
The ordering of rows under these comparison operators is deterministic
- but not otherwise meaningful. These operators are used internally for
- materialized views and might be useful for other specialized purposes
- such as replication but are not intended to be generally useful for
- writing queries.
+ but not otherwise meaningful. These operators are used internally
+ for materialized views and might be useful for other specialized
purposes such as replication and B-Tree deduplication (see
<xref linkend="btree-deduplication"/>). They are not intended to be
+ generally useful for writing queries, though.
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index ab362a0dc5..a05e2e6b9c 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -171,6 +171,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
@@ -393,10 +395,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
- B-tree indexes additionally accept this parameter:
+ B-tree indexes also accept these parameters:
+
+ deduplicate_items
+
+ deduplicate_items
+ storage parameter
+
+
+
+
+ Controls usage of the B-tree deduplication technique described
+ in <xref linkend="btree-deduplication"/>. Set to
+ ON or OFF to enable or
+ disable the optimization. (Alternative spellings of
+ ON and OFF are allowed as
+ described in <xref linkend="config-setting"/>.) The default is
+ ON.
+
+
+
+
+ Turning deduplicate_items off via
+ ALTER INDEX prevents future insertions from
+ triggering deduplication, but does not in itself make existing
+ posting list tuples use the standard tuple representation.
+
+
+
+
+
vacuum_cleanup_index_scale_factor
@@ -451,9 +482,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] . It is a Boolean parameter:
ON enables fast update, OFF disables it.
- (Alternative spellings of ON and OFF are
- allowed as described in .) The
- default is ON.
+ The default is ON.
@@ -805,6 +834,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) INCLUDE (director, rating);
+
+ To create a B-Tree index with deduplication disabled:
+
+CREATE INDEX title_idx ON films (title) WITH (deduplicate_items = off);
+
+
+
To create an index on the expression lower(title),
allowing efficient case-insensitive searches:
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 79430d2b7b..5325dd3f61 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -158,6 +158,16 @@ static relopt_bool boolRelOpts[] =
},
true
},
+ {
+ {
+ "deduplicate_items",
+ "Enables \"deduplicate items\" feature for this btree index",
+ RELOPT_KIND_BTREE,
+ ShareUpdateExclusiveLock /* since it applies only to later
+ * inserts */
+ },
+ true
+ },
/* list terminator */
{{NULL}}
};
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index c16eb05416..dfba5ae39a 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation,
/*
* Get the latestRemovedXid from the table entries pointed at by the index
* tuples being deleted.
+ *
+ * Note: index access methods that don't consistently use the standard
+ * IndexTuple + heap TID item pointer representation will need to provide
+ * their own version of this function.
*/
TransactionId
index_compute_xid_horizon_for_tuples(Relation irel,
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index bf245f5dab..d69808e78c 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global
OBJS = \
nbtcompare.o \
+ nbtdedup.o \
nbtinsert.o \
nbtpage.o \
nbtree.o \
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index c60a4d0d9e..6499f5adb7 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly
like a hint bit for a heap tuple), but physically removing tuples requires
exclusive lock. In the current code we try to remove LP_DEAD tuples when
we are otherwise faced with having to split a page to do an insertion (and
-hence have exclusive lock on it already).
+hence have exclusive lock on it already). Deduplication can also prevent
+a page split, but removing LP_DEAD tuples is the preferred approach.
+(Note that posting list tuples can only have their LP_DEAD bit set when
+every table TID within the posting list is known dead.)
This leaves the index in a state where it has no entry for a dead tuple
that still exists in the heap. This is not a problem for the current
@@ -726,6 +729,134 @@ if it must. When a page that's already full of duplicates must be split,
the fallback strategy assumes that duplicates are mostly inserted in
ascending heap TID order. The page is split in a way that leaves the left
half of the page mostly full, and the right half of the page mostly empty.
+The overall effect is that leaf page splits gracefully adapt to inserts of
+large groups of duplicates, maximizing space utilization. Note also that
+"trapping" large groups of duplicates on the same leaf page like this makes
+deduplication more efficient. Deduplication can be performed infrequently,
+without merging together existing posting list tuples too often.
+
+Notes about deduplication
+-------------------------
+
+We deduplicate non-pivot tuples in non-unique indexes to reduce storage
+overhead, and to avoid (or at least delay) page splits. Note that the
+goals for deduplication in unique indexes are rather different; see later
+section for details. Deduplication alters the physical representation of
+tuples without changing the logical contents of the index, and without
+adding overhead to read queries. Non-pivot tuples are merged together
+into a single physical tuple with a posting list (a simple array of heap
+TIDs with the standard item pointer format). Deduplication is always
+applied lazily, at the point where it would otherwise be necessary to
+perform a page split. It occurs only when LP_DEAD items have been
+removed, as our last line of defense against splitting a leaf page. We
+can set the LP_DEAD bit with posting list tuples, though only when all
+TIDs are known dead.
+
+Our lazy approach to deduplication allows the page space accounting used
+during page splits to have absolutely minimal special case logic for
+posting lists. Posting lists can be thought of as extra payload that
+suffix truncation will reliably truncate away as needed during page
+splits, just like non-key columns from an INCLUDE index tuple.
+Incoming/new tuples can generally be treated as non-overlapping plain
+items (though see section on posting list splits for information about how
+overlapping new/incoming items are really handled).
+
+The representation of posting lists is almost identical to the posting
+lists used by GIN, so it would be straightforward to apply GIN's varbyte
+encoding compression scheme to individual posting lists. Posting list
+compression would break the assumptions made by posting list splits about
+page space accounting (see later section), so it's not clear how
+compression could be integrated with nbtree. Besides, posting list
+compression does not offer a compelling trade-off for nbtree, since in
+general nbtree is optimized for consistent performance with many
+concurrent readers and writers.
+
+A major goal of our lazy approach to deduplication is to limit the
+performance impact of deduplication with random updates. Even concurrent
+append-only inserts of the same key value will tend to have inserts of
+individual index tuples in an order that doesn't quite match heap TID
+order. Delaying deduplication minimizes page level fragmentation.
+
+Deduplication in unique indexes
+-------------------------------
+
+Very often, the range of values that can be placed on a given leaf page in
+a unique index is fixed and permanent. For example, a primary key on an
+identity column will usually only have page splits caused by the insertion
+of new logical rows within the rightmost leaf page. If there is a split
+of a non-rightmost leaf page, then the split must have been triggered by
+inserts associated with an UPDATE of an existing logical row. Splitting a
+leaf page purely to store multiple versions should be considered
+pathological, since it permanently degrades the index structure in order
+to absorb a temporary burst of duplicates. Deduplication in unique
+indexes helps to prevent these pathological page splits. Storing
+duplicates in a space efficient manner is not the goal, since in the long
+run there won't be any duplicates anyway. Rather, we're buying time for
+standard garbage collection mechanisms to run before a page split is
+needed.
+
+Unique index leaf pages only get a deduplication pass when an insertion
+(that might have to split the page) observed an existing duplicate on the
+page in passing. This is based on the assumption that deduplication will
+only work out when _all_ new insertions are duplicates from UPDATEs. This
+may mean that we miss an opportunity to delay a page split, but that's
+okay because our ultimate goal is to delay leaf page splits _indefinitely_
+(i.e. to prevent them altogether). There is little point in trying to
+delay a split that is probably inevitable anyway. This allows us to avoid
+the overhead of attempting to deduplicate with unique indexes that always
+have few or no duplicates.
+
+Posting list splits
+-------------------
+
+When the incoming tuple happens to overlap with an existing posting list,
+a posting list split is performed. Like a page split, a posting list
+split resolves a situation where a new/incoming item "won't fit", while
+inserting the incoming item in passing (i.e. as part of the same atomic
+action). It's possible (though not particularly likely) that an insert of
+a new item on to an almost-full page will overlap with a posting list,
+resulting in both a posting list split and a page split. Even then, the
+atomic action that splits the posting list also inserts the new item
+(since page splits always insert the new item in passing). Including the
+posting list split in the same atomic action as the insert avoids problems
+caused by concurrent inserts into the same posting list -- the exact
+details of how we change the posting list depend upon the new item, and
+vice-versa. A single atomic action also minimizes the volume of extra
+WAL required for a posting list split, since we don't have to explicitly
+WAL-log the original posting list tuple.
+
+Despite piggy-backing on the same atomic action that inserts a new tuple,
+posting list splits can be thought of as a separate, extra action to the
+insert itself (or to the page split itself). Posting list splits
+conceptually "rewrite" an insert that overlaps with an existing posting
+list into an insert that adds its final new item just to the right of the
+posting list instead. The size of the posting list won't change, and so
+page space accounting code does not need to care about posting list splits
+at all. This is an important upside of our design; the page split point
+choice logic is very subtle even without it needing to deal with posting
+list splits.
+
+Only a few isolated extra steps are required to preserve the illusion that
+the new item never overlapped with an existing posting list in the first
+place: the heap TID of the incoming tuple is swapped with the rightmost/max
+heap TID from the existing/originally overlapping posting list. Also, the
+posting-split-with-page-split case must generate a new high key based on
+an imaginary version of the original page that has both the final new item
+and the after-list-split posting tuple (page splits usually just operate
+against an imaginary version that contains the new item/item that won't
+fit).
+
+This approach avoids inventing an "eager" atomic posting split operation
+that splits the posting list without simultaneously finishing the insert
+of the incoming item. This alternative design might seem cleaner, but it
+creates subtle problems for page space accounting. In general, there
+might not be enough free space on the page to split a posting list such
+that the incoming/new item no longer overlaps with either posting list
+half -- the operation could fail before the actual retail insert of the
+new item even begins. We'd end up having to handle posting list splits
+that need a page split anyway. Besides, supporting variable "split points"
+while splitting posting lists won't actually improve overall space
+utilization.
Notes About Data Representation
-------------------------------
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
new file mode 100644
index 0000000000..e5481f2f93
--- /dev/null
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -0,0 +1,830 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtdedup.c
+ * Deduplicate items in Postgres btrees.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/nbtree/nbtdedup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/nbtxlog.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem);
+static void _bt_singleval_fillfactor(Page page, BTDedupState state,
+ Size newitemsz);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_posting_valid(IndexTuple posting);
+#endif
+
+/*
+ * Deduplicate items on a leaf page. The page will have to be split by caller
+ * if we cannot successfully free at least newitemsz (we also need space for
+ * newitem's line pointer, which isn't included in caller's newitemsz).
+ *
+ * The general approach taken here is to perform as much deduplication as
+ * possible to free as much space as possible. Note, however, that "single
+ * value" strategy is sometimes used for !checkingunique callers, in which
+ * case deduplication will leave a few tuples untouched at the end of the
+ * page. The general idea is to prepare the page for an anticipated page
+ * split that uses nbtsplitloc.c's "single value" strategy to determine a
+ * split point. (There is no reason to deduplicate items that will end up on
+ * the right half of the page after the anticipated page split; better to
+ * handle those if and when the anticipated right half page gets its own
+ * deduplication pass, following further inserts of duplicates.)
+ *
+ * This function should be called during insertion, when the page doesn't have
+ * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
+ * was set, caller should have removed any LP_DEAD items by calling
+ * _bt_vacuum_one_page() before calling here. We may still have to kill
+ * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
+ * but that should be rare. Also, _bt_vacuum_one_page() won't unset the
+ * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
+ * deduplication pass will always clear it, just to keep things tidy.
+ */
+void
+_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz, bool checkingunique)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buf);
+ BTPageOpaque opaque;
+ Page newpage;
+ int newpagendataitems = 0;
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
+ BTDedupState state;
+ int ndeletable = 0;
+ Size pagesaving = 0;
+ bool singlevalstrat = false;
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+
+ /*
+ * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
+ * will clear the BTP_HAS_GARBAGE hint without reliably removing items
+ * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
+ * bits when deduplicating items. Allowing it would be correct, though
+ * wasteful.
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemid))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
+
+ /*
+ * Return when a split will be avoided. This is equivalent to
+ * avoiding a split using the usual _bt_vacuum_one_page() path.
+ */
+ if (PageGetFreeSpace(page) >= newitemsz)
+ return;
+
+ /*
+ * Reconsider number of items on page, in case _bt_delitems_delete()
+ * managed to delete an item or two
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+ newitemsz += sizeof(ItemIdData);
+
+ /*
+ * By here, it's clear that deduplication will definitely be attempted.
+ * Initialize deduplication state.
+ *
+ * It would be possible for maxpostingsize (limit on posting list tuple
+ * size) to be set to one third of the page. However, it seems like a
+ * good idea to limit the size of posting lists to one sixth of a page.
+ * That ought to leave us with a good split point when pages full of
+ * duplicates can be split several times.
+ */
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true;
+ state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
+ /* Metadata about base tuple of current pending posting list */
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ /* Size of all physical tuples to be replaced by pending posting list */
+ state->phystupsize = 0;
+ /* No deduplication intervals recorded for the WAL record yet */
+ state->nintervals = 0;
+
+ /* Determine if "single value" strategy should be used */
+ if (!checkingunique)
+ singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
+
+ /*
+ * Deduplicate items from page, and write them to newpage.
+ *
+ * Copy the original page's LSN into newpage copy. This will become the
+ * updated version of the page. We need this because XLogInsert will
+ * examine the LSN and possibly dump it in a page image.
+ */
+ newpage = PageGetTempPageCopySpecial(page);
+ PageSetLSN(newpage, PageGetLSN(page));
+
+ /* Copy high key, if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId hitemid = PageGetItemId(page, P_HIKEY);
+ Size hitemsz = ItemIdGetLength(hitemid);
+ IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
+
+ if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(!ItemIdIsDead(itemid));
+
+ if (offnum == minoff)
+ {
+ /*
+ * No previous/base tuple for the data item -- use the data item
+ * as base tuple of pending posting list
+ */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ else if (state->deduplicate &&
+ _bt_keep_natts_fast(rel, state->base, itup) > natts &&
+ _bt_dedup_save_htid(state, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID(s) for itup have been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list for some other reason (e.g., adding more
+ * TIDs would have caused posting list to exceed current
+ * maxpostingsize).
+ *
+ * If state contains pending posting list with more than one item,
+ * form new posting tuple, and actually update the page. Else
+ * reset the state and move on without modifying the page.
+ */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+ newpagendataitems++;
+
+ if (singlevalstrat)
+ {
+ /*
+ * Single value strategy's extra steps.
+ *
+ * Lower maxpostingsize for sixth and final item that might be
+ * deduplicated by current deduplication pass. When sixth
+ * item formed/observed, stop deduplicating items.
+ *
+ * Note: It's possible that this will be reached even when
+ * current deduplication pass has yet to merge together some
+ * existing items. It doesn't matter whether or not the
+ * current call generated the maxpostingsize-capped duplicate
+ * tuples at the start of the page.
+ */
+ if (newpagendataitems == 5)
+ _bt_singleval_fillfactor(page, state, newitemsz);
+ else if (newpagendataitems == 6)
+ {
+ state->deduplicate = false;
+ singlevalstrat = false; /* won't be back here */
+ }
+ }
+
+ /* itup starts new pending posting list */
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ /* Handle the last item */
+ pagesaving += _bt_dedup_finish_pending(newpage, state);
+ newpagendataitems++;
+
+ /*
+ * If no items suitable for deduplication were found, newpage must be
+ * exactly the same as the original page, so just return from function.
+ *
+ * We could determine whether or not to proceed on the basis of the space
+ * savings being sufficient to avoid an immediate page split instead. We
+ * don't do that because there is some small value in nbtsplitloc.c always
+ * operating against a page that is fully deduplicated (apart from
+ * newitem). Besides, most of the cost has already been paid.
+ */
+ if (state->nintervals == 0)
+ {
+ /* cannot leak memory here */
+ pfree(newpage);
+ pfree(state->htids);
+ pfree(state);
+ return;
+ }
+
+ /*
+ * By here, it's clear that deduplication will definitely go ahead.
+ *
+ * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
+ * still falsely set, just to keep things tidy. (We can't rely on
+ * _bt_vacuum_one_page() having done this already, and we can't rely on a
+ * page split or VACUUM getting to it in the near future.)
+ */
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+ xl_btree_dedup xlrec_dedup;
+
+ xlrec_dedup.nintervals = state->nintervals;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
+
+ /*
+ * The intervals array is not in the buffer, but pretend that it is.
+ * When XLogInsert stores the whole buffer, the array need not be
+ * stored too.
+ */
+ XLogRegisterBufData(0, (char *) state->intervals,
+ state->nintervals * sizeof(BTDedupInterval));
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
+
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Local space accounting should agree with page accounting */
+ Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+ /* cannot leak memory here */
+ pfree(state->htids);
+ pfree(state);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's base tuple.
+ *
+ * Every tuple processed by deduplication either becomes the base tuple for a
+ * posting list, or gets its heap TID(s) accepted into a pending posting list.
+ * A tuple that starts out as the base tuple for a posting list will only
+ * actually be rewritten within _bt_dedup_finish_pending() when it turns out
+ * that there are duplicates that can be merged into the base tuple.
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff)
+{
+ Assert(state->nhtids == 0);
+ Assert(state->nitems == 0);
+ Assert(!BTreeTupleIsPivot(base));
+
+ /*
+ * Copy heap TID(s) from new base tuple for new candidate posting list
+ * into working state's array
+ */
+ if (!BTreeTupleIsPosting(base))
+ {
+ memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
+ state->nhtids = 1;
+ state->basetupsize = IndexTupleSize(base);
+ }
+ else
+ {
+ int nposting;
+
+ nposting = BTreeTupleGetNPosting(base);
+ memcpy(state->htids, BTreeTupleGetPosting(base),
+ sizeof(ItemPointerData) * nposting);
+ state->nhtids = nposting;
+ /* basetupsize should not include existing posting list */
+ state->basetupsize = BTreeTupleGetPostingOffset(base);
+ }
+
+ /*
+ * Save new base tuple itself -- it'll be needed if we actually create a
+ * new posting list from new pending posting list.
+ *
+ * Must maintain physical size of all existing tuples (including line
+ * pointer overhead) so that we can calculate space savings on page.
+ */
+ state->nitems = 1;
+ state->base = base;
+ state->baseoff = baseoff;
+ state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
+ /* Stage baseoff for next WAL interval (kept only if a posting list forms) */
+ state->intervals[state->nintervals].baseoff = state->baseoff;
+}
+
+/*
+ * Save itup heap TID(s) into pending posting list where possible.
+ *
+ * Returns bool indicating if the pending posting list managed by state now
+ * includes itup's heap TID(s).
+ */
+bool
+_bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
+{
+ int nhtids;
+ ItemPointer htids;
+ Size mergedtupsz;
+
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ nhtids = 1;
+ htids = &itup->t_tid;
+ }
+ else
+ {
+ nhtids = BTreeTupleGetNPosting(itup);
+ htids = BTreeTupleGetPosting(itup);
+ }
+
+ /*
+ * Don't append (have caller finish pending posting list as-is) if
+ * appending heap TID(s) from itup would put us over maxpostingsize limit.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples.
+ */
+ mergedtupsz = MAXALIGN(state->basetupsize +
+ (state->nhtids + nhtids) * sizeof(ItemPointerData));
+
+ if (mergedtupsz > state->maxpostingsize)
+ {
+ /* State is left unchanged; caller finishes the pending list as-is */
+ return false;
+ }
+
+ /*
+ * Save heap TIDs to pending posting list tuple -- itup can be merged into
+ * pending posting list
+ */
+ state->nitems++;
+ memcpy(state->htids + state->nhtids, htids,
+ sizeof(ItemPointerData) * nhtids);
+ state->nhtids += nhtids;
+ state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+ return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead. This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Page newpage, BTDedupState state)
+{
+ OffsetNumber tupoff;
+ Size tuplesz;
+ Size spacesaving;
+
+ Assert(state->nitems > 0);
+ Assert(state->nitems <= state->nhtids);
+ Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
+
+ tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
+ if (state->nitems == 1)
+ {
+ /* Use original, unchanged base tuple */
+ tuplesz = IndexTupleSize(state->base);
+ if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ spacesaving = 0;
+ }
+ else
+ {
+ IndexTuple final;
+
+ /* Form a tuple with a posting list */
+ final = _bt_form_posting(state->base, state->htids, state->nhtids);
+ tuplesz = IndexTupleSize(final);
+ Assert(tuplesz <= state->maxpostingsize);
+
+ /* Commit interval: record number of physical tuples merged together */
+ state->intervals[state->nintervals].nitems = state->nitems;
+
+ Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
+ if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add tuple to page");
+
+ pfree(final);
+ spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
+ /* Increment nintervals, since we wrote a new posting list tuple */
+ state->nintervals++;
+ Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+ }
+
+ /* Reset state for next pending posting list */
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+
+ return spacesaving;
+}
+
+/*
+ * Determine if page non-pivot tuples (data items) are all duplicates of the
+ * same value -- if they are, deduplication's "single value" strategy should
+ * be applied. The general goal of this strategy is to ensure that
+ * nbtsplitloc.c (which uses its own single value strategy) will find a useful
+ * split point as further duplicates are inserted, and successive rightmost
+ * page splits occur among pages that store the same duplicate value. When
+ * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
+ * just like it would if deduplication were disabled.
+ *
+ * We expect that affected workloads will require _several_ single value
+ * strategy deduplication passes (over a page that only stores duplicates)
+ * before the page is finally split. The first deduplication pass should only
+ * find regular non-pivot tuples. Later deduplication passes will find
+ * existing maxpostingsize-capped posting list tuples, which must be skipped
+ * over. The penultimate pass is generally the first pass that actually
+ * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
+ * few untouched non-pivot tuples. The final deduplication pass won't free
+ * any space -- it will skip over everything without merging anything (it
+ * retraces the steps of the penultimate pass).
+ *
+ * Fortunately, having several passes isn't too expensive. Each pass (after
+ * the first pass) won't spend many cycles on the large posting list tuples
+ * left by previous passes. Each pass will find a large contiguous group of
+ * smaller duplicate tuples to merge together at the end of the page.
+ *
+ * Note: We deliberately don't bother checking if the high key is a distinct
+ * value (prior to the TID tiebreaker column) before proceeding, unlike
+ * nbtsplitloc.c. Its single value strategy only gets applied on the
+ * rightmost page of duplicates of the same value (other leaf pages full of
+ * duplicates will get a simple 50:50 page split instead of splitting towards
+ * the end of the page). There is little point in making the same distinction
+ * here.
+ */
+static bool
+_bt_do_singleval(Relation rel, Page page, BTDedupState state,
+ OffsetNumber minoff, IndexTuple newitem)
+{
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+ ItemId itemid;
+ IndexTuple itup;
+
+ /*
+ * Page items are kept in key order, so it suffices to check that newitem
+ * is a duplicate of both the first and the last data item -- everything
+ * in between must then be a duplicate of the same value, too.
+ */
+ itemid = PageGetItemId(page, minoff);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+ {
+ itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
+ * and final maxpostingsize-capped tuple. The sixth and final posting list
+ * tuple will end up somewhat smaller than the first five. (Note: The first
+ * five tuples could actually just be very large duplicate tuples that
+ * couldn't be merged together at all. Deduplication will simply not modify
+ * the page when that happens.)
+ *
+ * When there are six posting lists on the page (after current deduplication
+ * pass goes on to create/observe a sixth very large tuple), caller should end
+ * its deduplication pass. It isn't useful to try to deduplicate items that
+ * are supposed to end up on the new right sibling page following the
+ * anticipated page split. A future deduplication pass of future right
+ * sibling page might take care of it. (This is why the first single value
+ * strategy deduplication pass for a given leaf page will generally find only
+ * plain non-pivot tuples -- see _bt_do_singleval() comments.)
+ */
+static void
+_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
+{
+ Size leftfree;
+ int reduction;
+
+ /* This calculation needs to match the leftfree accounting in nbtsplitloc.c */
+ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+ MAXALIGN(sizeof(BTPageOpaqueData));
+ /* Subtract size of new high key (includes pivot heap TID space) */
+ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+ /*
+ * Reduce maxpostingsize by an amount equal to target free space on left
+ * half of page
+ */
+ reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+ if (state->maxpostingsize > reduction)
+ state->maxpostingsize -= reduction;
+ else
+ state->maxpostingsize = 0;
+}
+
+/*
+ * Build a posting list tuple based on caller's "base" index tuple and list of
+ * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
+ * posting list. (Posting list tuples can never have a single heap TID, partly
+ * because that ensures that deduplication always reduces final MAXALIGN()'d
+ * size of entire tuple.)
+ *
+ * Convention is that posting list starts at a MAXALIGN()'d offset (rather
+ * than a SHORTALIGN()'d offset), in line with the approach taken when
+ * appending a heap TID to new pivot tuple/high key during suffix truncation.
+ * This sometimes wastes a little space that was only needed as alignment
+ * padding in the original tuple. Following this convention simplifies the
+ * space accounting used when deduplicating a page (the same convention
+ * simplifies the accounting for choosing a point to split a page at).
+ *
+ * Note: Caller's "htids" array must be unique and already in ascending TID
+ * order. Any existing heap TIDs from "base" won't automatically appear in
+ * returned posting list tuple (they must be included in htids array.)
+ */
+IndexTuple
+_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
+{
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+
+ if (BTreeTupleIsPosting(base))
+ keysize = BTreeTupleGetPostingOffset(base);
+ else
+ keysize = IndexTupleSize(base);
+
+ Assert(!BTreeTupleIsPivot(base));
+ Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
+ Assert(keysize == MAXALIGN(keysize));
+
+ /* Determine final MAXALIGN()'d size of new tuple */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ Assert(newsize <= INDEX_SIZE_MASK);
+ Assert(newsize == MAXALIGN(newsize));
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, base, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ memcpy(BTreeTupleGetPosting(itup), htids,
+ sizeof(ItemPointerData) * nhtids);
+ Assert(_bt_posting_valid(itup));
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(htids, &itup->t_tid);
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ }
+
+ return itup;
+}
+
+/*
+ * Generate a replacement tuple by "updating" a posting list tuple so that it
+ * no longer has TIDs that need to be deleted.
+ *
+ * Used by VACUUM. Caller's vacposting argument points to the existing
+ * posting list tuple to be updated.
+ *
+ * On return, caller's vacposting argument will point to final "updated"
+ * tuple, which will be palloc()'d in caller's memory context.
+ */
+void
+_bt_update_posting(BTVacuumPosting vacposting)
+{
+ IndexTuple origtuple = vacposting->itup;
+ uint32 keysize,
+ newsize;
+ IndexTuple itup;
+ int nhtids;
+ int ui,
+ d;
+ ItemPointer htids;
+
+ nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
+
+ Assert(_bt_posting_valid(origtuple));
+ Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
+
+ /* (defensive: assert above establishes origtuple is a posting list) */
+ if (BTreeTupleIsPosting(origtuple))
+ keysize = BTreeTupleGetPostingOffset(origtuple);
+ else
+ keysize = IndexTupleSize(origtuple);
+
+ /*
+ * Determine final size of new tuple.
+ *
+ * This calculation needs to match the code used within _bt_form_posting()
+ * for new posting list tuples. We avoid calling _bt_form_posting() here
+ * to save ourselves a second memory allocation for a htids workspace.
+ */
+ if (nhtids > 1)
+ newsize = MAXALIGN(keysize +
+ nhtids * sizeof(ItemPointerData));
+ else
+ newsize = keysize;
+
+ /* Allocate memory using palloc0() (matches index_form_tuple()) */
+ itup = palloc0(newsize);
+ memcpy(itup, origtuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nhtids > 1)
+ {
+ /* Form posting list tuple */
+ BTreeTupleSetPosting(itup, nhtids, keysize);
+ htids = BTreeTupleGetPosting(itup);
+ }
+ else
+ {
+ /* Form standard non-pivot tuple */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ htids = &itup->t_tid;
+ }
+
+ /* ui: next output TID slot; d: next entry in sorted deletetids[] array */
+ ui = 0;
+ d = 0;
+ for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
+ {
+ if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
+ {
+ d++;
+ continue;
+ }
+ htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
+ }
+ Assert(ui == nhtids);
+ Assert(d == vacposting->ndeletedtids);
+ Assert(nhtids == 1 || _bt_posting_valid(itup));
+
+ /* vacposting arg's itup will now point to updated version */
+ vacposting->itup = itup;
+}
+
+/*
+ * Prepare for a posting list split by swapping heap TID in newitem with heap
+ * TID from original posting list (the 'oposting' heap TID located at offset
+ * 'postingoff'). Modifies newitem, so caller should pass their own private
+ * copy that can safely be modified.
+ *
+ * Returns new posting list tuple, which is palloc()'d in caller's context.
+ * This is guaranteed to be the same size as 'oposting'. Modified newitem is
+ * what caller actually inserts. (This happens inside the same critical
+ * section that performs an in-place update of old posting list using new
+ * posting list returned here.)
+ *
+ * While the keys from newitem and oposting must be opclass equal, and must
+ * generate identical output when run through the underlying type's output
+ * function, it doesn't follow that their representations match exactly.
+ * Caller must avoid assuming that there can't be representational differences
+ * that make datums from oposting bigger or smaller than the corresponding
+ * datums from newitem. For example, differences in TOAST input state might
+ * break a faulty assumption about tuple size (the executor is entitled to
+ * apply TOAST compression based on its own criteria). It also seems possible
+ * that further representational variation will be introduced in the future,
+ * in order to support nbtree features like page-level prefix compression.
+ *
+ * See nbtree/README for details on the design of posting list splits.
+ */
+IndexTuple
+_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
+{
+ int nhtids; /* TID count in original posting list */
+ char *replacepos; /* gap that receives newitem's TID */
+ char *replaceposright;
+ Size nmovebytes;
+ IndexTuple nposting;
+
+ nhtids = BTreeTupleGetNPosting(oposting);
+ Assert(_bt_posting_valid(oposting));
+ Assert(postingoff > 0 && postingoff < nhtids);
+
+ /*
+ * Move item pointers in posting list to make a gap for the new item's
+ * heap TID. We shift TIDs one place to the right, losing original
+ * rightmost TID. (nmovebytes must not include TIDs to the left of
+ * postingoff, nor the existing rightmost/max TID that gets overwritten.)
+ */
+ nposting = CopyIndexTuple(oposting);
+ replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
+ replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
+ nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); /* bytes shifted right by one TID */
+ memmove(replaceposright, replacepos, nmovebytes);
+
+ /* Fill the gap at postingoff with TID of new item (original new TID) */
+ Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
+ ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
+
+ /* Now copy oposting's rightmost/max TID into new item (final new TID) */
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
+ BTreeTupleGetHeapTID(newitem)) < 0);
+ Assert(_bt_posting_valid(nposting));
+
+ return nposting;
+}
+
+/*
+ * Verify posting list invariants for "posting", which must be a posting list
+ * tuple. Used within assertions.
+ */
+#ifdef USE_ASSERT_CHECKING
+static bool
+_bt_posting_valid(IndexTuple posting)
+{
+ ItemPointerData last;
+ ItemPointer htid;
+
+ if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2) /* must have at least two TIDs */
+ return false;
+
+ /* Remember first heap TID for loop */
+ ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
+ if (!ItemPointerIsValid(&last)) /* first TID must be a valid heap pointer */
+ return false;
+
+ /* Iterate, starting from second TID */
+ for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
+ {
+ htid = BTreeTupleGetPostingN(posting, i);
+
+ if (!ItemPointerIsValid(htid))
+ return false;
+ if (ItemPointerCompare(htid, &last) <= 0) /* TIDs must be strictly ascending */
+ return false;
+ ItemPointerCopy(htid, &last);
+ }
+
+ return true;
+}
+#endif
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 4e5849ab8e..b913543221 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -47,10 +47,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
+ int postingoff,
bool split_only_page);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem);
+ IndexTuple newitem, IndexTuple orignewitem,
+ IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
@@ -125,6 +127,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
insertstate.itup_key = itup_key;
insertstate.bounds_valid = false;
insertstate.buf = InvalidBuffer;
+ insertstate.postingoff = 0;
/*
* It's very common to have an index on an auto-incremented or
@@ -295,7 +298,7 @@ top:
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
_bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
- itup, newitemoff, false);
+ itup, newitemoff, insertstate.postingoff, false);
}
else
{
@@ -340,6 +343,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
uint32 *speculativeToken)
{
IndexTuple itup = insertstate->itup;
+ IndexTuple curitup;
+ ItemId curitemid;
BTScanInsert itup_key = insertstate->itup_key;
SnapshotData SnapshotDirty;
OffsetNumber offset;
@@ -348,6 +353,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
bool found = false;
+ bool inposting = false;
+ bool prevalldead = true;
+ int curposti = 0;
/* Assume unique until we find a duplicate */
*is_unique = true;
@@ -375,13 +383,21 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
Assert(itup_key->scantid == NULL);
for (;;)
{
- ItemId curitemid;
- IndexTuple curitup;
- BlockNumber nblkno;
-
/*
- * make sure the offset points to an actual item before trying to
- * examine it...
+ * Each iteration of the loop processes one heap TID, not one index
+ * tuple. Current offset number for page isn't usually advanced on
+ * iterations that process heap TIDs from posting list tuples.
+ *
+ * "inposting" state is set when _inside_ a posting list --- not when
+ * we're at the start (or end) of a posting list. We advance curposti
+ * at the end of the iteration when inside a posting list tuple. In
+ * general, every loop iteration either advances the page offset or
+ * advances curposti --- an iteration that handles the rightmost/max
+ * heap TID in a posting list finally advances the page offset (and
+ * unsets "inposting").
+ *
+ * Make sure the offset points to an actual index tuple before trying
+ * to examine it...
*/
if (offset <= maxoff)
{
@@ -406,31 +422,60 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
break;
}
- curitemid = PageGetItemId(page, offset);
-
/*
- * We can skip items that are marked killed.
+ * We can skip items that are already marked killed.
*
* In the presence of heavy update activity an index may contain
* many killed items with the same key; running _bt_compare() on
* each killed item gets expensive. Just advance over killed
* items as quickly as we can. We only apply _bt_compare() when
- * we get to a non-killed item. Even those comparisons could be
- * avoided (in the common case where there is only one page to
- * visit) by reusing bounds, but just skipping dead items is fast
- * enough.
+ * we get to a non-killed item. We could reuse the bounds to
+ * avoid _bt_compare() calls for known equal tuples, but it
+ * doesn't seem worth it. Workloads with heavy update activity
+ * tend to have many deduplication passes, so we'll often avoid
+ * most of those comparisons, too (we call _bt_compare() when the
+ * posting list tuple is initially encountered, though not when
+ * processing later TIDs from the same tuple).
*/
- if (!ItemIdIsDead(curitemid))
+ if (!inposting)
+ curitemid = PageGetItemId(page, offset);
+ if (inposting || !ItemIdIsDead(curitemid))
{
ItemPointerData htid;
bool all_dead;
- if (_bt_compare(rel, itup_key, page, offset) != 0)
- break; /* we're past all the equal tuples */
+ if (!inposting)
+ {
+ /* Plain tuple, or first TID in posting list tuple */
+ if (_bt_compare(rel, itup_key, page, offset) != 0)
+ break; /* we're past all the equal tuples */
- /* okay, we gotta fetch the heap tuple ... */
- curitup = (IndexTuple) PageGetItem(page, curitemid);
- htid = curitup->t_tid;
+ /* Advance curitup to the tuple at this offset */
+ curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPivot(curitup));
+ }
+
+ /* okay, we gotta fetch the heap tuple using htid ... */
+ if (!BTreeTupleIsPosting(curitup))
+ {
+ /* ... htid is from simple non-pivot tuple */
+ Assert(!inposting);
+ htid = curitup->t_tid;
+ }
+ else if (!inposting)
+ {
+ /* ... htid is first TID in new posting list */
+ inposting = true;
+ prevalldead = true;
+ curposti = 0;
+ htid = *BTreeTupleGetPostingN(curitup, 0);
+ }
+ else
+ {
+ /* ... htid is second or subsequent TID in posting list */
+ Assert(curposti > 0);
+ htid = *BTreeTupleGetPostingN(curitup, curposti);
+ }
/*
* If we are doing a recheck, we expect to find the tuple we
@@ -506,8 +551,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* not part of this chain because it had a different index
* entry.
*/
- htid = itup->t_tid;
- if (table_index_fetch_tuple_check(heapRel, &htid,
+ if (table_index_fetch_tuple_check(heapRel, &itup->t_tid,
SnapshotSelf, NULL))
{
/* Normal case --- it's still live */
@@ -565,12 +609,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
RelationGetRelationName(rel))));
}
}
- else if (all_dead)
+ else if (all_dead && (!inposting ||
+ (prevalldead &&
+ curposti == BTreeTupleGetNPosting(curitup) - 1)))
{
/*
- * The conflicting tuple (or whole HOT chain) is dead to
- * everyone, so we may as well mark the index entry
- * killed.
+ * The conflicting tuple (or all HOT chains pointed to by
+ * all posting list TIDs) is dead to everyone, so mark the
+ * index entry killed.
*/
ItemIdMarkDead(curitemid);
opaque->btpo_flags |= BTP_HAS_GARBAGE;
@@ -584,14 +630,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
else
MarkBufferDirtyHint(insertstate->buf, true);
}
+
+ /*
+ * Remember if posting list tuple has even a single HOT chain
+ * whose members are not all dead
+ */
+ if (!all_dead && inposting)
+ prevalldead = false;
}
}
- /*
- * Advance to next tuple to continue checking.
- */
- if (offset < maxoff)
+ if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
+ {
+ /* Advance to next TID in same posting list */
+ curposti++;
+ continue;
+ }
+ else if (offset < maxoff)
+ {
+ /* Advance to next tuple */
+ curposti = 0;
+ inposting = false;
offset = OffsetNumberNext(offset);
+ }
else
{
int highkeycmp;
@@ -606,7 +667,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/* Advance to next non-dead page --- there must be one */
for (;;)
{
- nblkno = opaque->btpo_next;
+ BlockNumber nblkno = opaque->btpo_next;
+
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -616,6 +678,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
elog(ERROR, "fell off the end of index \"%s\"",
RelationGetRelationName(rel));
}
+ /* Will also advance to next tuple */
+ curposti = 0;
+ inposting = false;
maxoff = PageGetMaxOffsetNumber(page);
offset = P_FIRSTDATAKEY(opaque);
/* Don't invalidate binary search bounds */
@@ -684,6 +749,7 @@ _bt_findinsertloc(Relation rel,
BTScanInsert itup_key = insertstate->itup_key;
Page page = BufferGetPage(insertstate->buf);
BTPageOpaque lpageop;
+ OffsetNumber newitemoff;
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -696,9 +762,13 @@ _bt_findinsertloc(Relation rel,
Assert(!insertstate->bounds_valid || checkingunique);
Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
+ Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
if (itup_key->heapkeyspace)
{
+ /* Keep track of whether checkingunique duplicate seen */
+ bool uniquedup = false;
+
/*
* If we're inserting into a unique index, we may have to walk right
* through leaf pages to find the one leaf page that we must insert on
@@ -715,6 +785,13 @@ _bt_findinsertloc(Relation rel,
*/
if (checkingunique)
{
+ if (insertstate->low < insertstate->stricthigh)
+ {
+ /* Encountered a duplicate in _bt_check_unique() */
+ Assert(insertstate->bounds_valid);
+ uniquedup = true;
+ }
+
for (;;)
{
/*
@@ -741,18 +818,43 @@ _bt_findinsertloc(Relation rel,
/* Update local state after stepping right */
page = BufferGetPage(insertstate->buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /* Assume duplicates (if checkingunique) */
+ uniquedup = true;
}
}
/*
* If the target page is full, see if we can obtain enough space by
- * erasing LP_DEAD items
+ * erasing LP_DEAD items. If that fails to free enough space, see if
+ * we can avoid a page split by performing a deduplication pass over
+ * the page.
+ *
+ * We only perform a deduplication pass for a checkingunique caller
+ * when the incoming item is a duplicate of an existing item on the
+ * leaf page. This heuristic avoids wasting cycles -- we only expect
+ * to benefit from deduplicating a unique index page when most or all
+ * recently added items are duplicates. See nbtree/README.
*/
- if (PageGetFreeSpace(page) < insertstate->itemsz &&
- P_HAS_GARBAGE(lpageop))
+ if (PageGetFreeSpace(page) < insertstate->itemsz)
{
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
+ if (P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
+
+ /* Might as well assume duplicates (if checkingunique) */
+ uniquedup = true;
+ }
+
+ if (itup_key->allequalimage && BTGetDeduplicateItems(rel) &&
+ (!checkingunique || uniquedup) &&
+ PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ _bt_dedup_one_page(rel, insertstate->buf, heapRel,
+ insertstate->itup, insertstate->itemsz,
+ checkingunique);
+ insertstate->bounds_valid = false;
+ }
}
}
else
@@ -834,7 +936,30 @@ _bt_findinsertloc(Relation rel,
Assert(P_RIGHTMOST(lpageop) ||
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
- return _bt_binsrch_insert(rel, insertstate);
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+
+ if (insertstate->postingoff == -1)
+ {
+ /*
+ * There is an overlapping posting list tuple with its LP_DEAD bit
+ * set. We don't want to unnecessarily unset its LP_DEAD bit while
+ * performing a posting list split, so delete all LP_DEAD items early.
+ * This is the only case where LP_DEAD deletes happen even though
+ * there is space for newitem on the page.
+ */
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+
+ /*
+ * Do new binary search. New insert location cannot overlap with any
+ * posting list now.
+ */
+ insertstate->bounds_valid = false;
+ insertstate->postingoff = 0;
+ newitemoff = _bt_binsrch_insert(rel, insertstate);
+ Assert(insertstate->postingoff == 0);
+ }
+
+ return newitemoff;
}
/*
@@ -900,10 +1025,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*
* This recursive procedure does the following things:
*
+ * + if postingoff != 0, splits existing posting list tuple
+ * (since it overlaps with new 'itup' tuple).
* + if necessary, splits the target page, using 'itup_key' for
* suffix truncation on leaf pages (caller passes NULL for
* non-leaf pages).
- * + inserts the tuple.
+ * + inserts the new tuple (might be split from posting list).
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
* right using information stored in the parent stack).
@@ -931,11 +1058,15 @@ _bt_insertonpg(Relation rel,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
+ int postingoff,
bool split_only_page)
{
Page page;
BTPageOpaque lpageop;
Size itemsz;
+ IndexTuple oposting;
+ IndexTuple origitup = NULL;
+ IndexTuple nposting = NULL;
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -949,6 +1080,7 @@ _bt_insertonpg(Relation rel,
Assert(P_ISLEAF(lpageop) ||
BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(!BTreeTupleIsPosting(itup));
/* The caller should've finished any incomplete splits already. */
if (P_INCOMPLETE_SPLIT(lpageop))
@@ -959,6 +1091,34 @@ _bt_insertonpg(Relation rel,
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
+ /*
+ * Do we need to split an existing posting list item?
+ */
+ if (postingoff != 0)
+ {
+ ItemId itemid = PageGetItemId(page, newitemoff);
+
+ /*
+ * The new tuple is a duplicate with a heap TID that falls inside the
+ * range of an existing posting list tuple on a leaf page. Prepare to
+ * split an existing posting list. Overwriting the posting list with
+ * its post-split version is treated as an extra step in either the
+ * insert or page split critical section.
+ */
+ Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid));
+ Assert(itup_key->heapkeyspace && itup_key->allequalimage);
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /* use a mutable copy of itup as our itup from here on */
+ origitup = itup;
+ itup = CopyIndexTuple(origitup);
+ nposting = _bt_swap_posting(itup, oposting, postingoff);
+ /* itup now contains rightmost/max TID from oposting */
+
+ /* Alter offset so that newitem goes after posting list */
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
/*
* Do we need to split the page to fit the item on it?
*
@@ -991,7 +1151,8 @@ _bt_insertonpg(Relation rel,
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ origitup, nposting, postingoff);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@@ -1066,6 +1227,9 @@ _bt_insertonpg(Relation rel,
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
+ if (postingoff != 0)
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
itup_blkno, RelationGetRelationName(rel));
@@ -1115,8 +1279,19 @@ _bt_insertonpg(Relation rel,
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
- if (P_ISLEAF(lpageop))
+ if (P_ISLEAF(lpageop) && postingoff == 0)
+ {
+ /* Simple leaf insert */
xlinfo = XLOG_BTREE_INSERT_LEAF;
+ }
+ else if (postingoff != 0)
+ {
+ /*
+ * Leaf insert with posting list split. Must include
+ * postingoff field before newitem/orignewitem.
+ */
+ xlinfo = XLOG_BTREE_INSERT_POST;
+ }
else
{
/*
@@ -1139,6 +1314,7 @@ _bt_insertonpg(Relation rel,
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
+ xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
@@ -1147,7 +1323,27 @@ _bt_insertonpg(Relation rel,
}
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ if (postingoff == 0)
+ {
+ /* Simple, common case -- log itup from caller */
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ }
+ else
+ {
+ /*
+ * Insert with posting list split (XLOG_BTREE_INSERT_POST
+ * record) case.
+ *
+ * Log postingoff. Also log origitup, not itup. REDO routine
+ * must reconstruct final itup (as well as nposting) using
+ * _bt_swap_posting().
+ */
+ uint16 upostingoff = postingoff;
+
+ XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
+ XLogRegisterBufData(0, (char *) origitup,
+ IndexTupleSize(origitup));
+ }
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
@@ -1189,6 +1385,14 @@ _bt_insertonpg(Relation rel,
_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
RelationSetTargetBlock(rel, cachedBlock);
}
+
+ /* be tidy */
+ if (postingoff != 0)
+ {
+ /* itup is actually a modified copy of caller's original */
+ pfree(nposting);
+ pfree(itup);
+ }
}
/*
@@ -1204,12 +1408,24 @@ _bt_insertonpg(Relation rel,
* This function will clear the INCOMPLETE_SPLIT flag on it, and
* release the buffer.
*
+ * orignewitem, nposting, and postingoff are needed when an insert of
+ * orignewitem results in both a posting list split and a page split.
+ * These extra posting list split details are used here in the same
+ * way as they are used in the more common case where a posting list
+ * split does not coincide with a page split. We need to deal with
+ * posting list splits directly in order to ensure that everything
+ * that follows from the insert of orignewitem is handled as a single
+ * atomic operation (though caller's insert of a new pivot/downlink
+ * into parent page will still be a separate operation). See
+ * nbtree/README for details on the design of posting list splits.
+ *
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
{
Buffer rbuf;
Page origpage;
@@ -1229,6 +1445,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber leftoff,
rightoff;
OffsetNumber firstright;
+ OffsetNumber origpagepostingoff;
OffsetNumber maxoff;
OffsetNumber i;
bool newitemonleft,
@@ -1298,6 +1515,34 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
PageSetLSN(leftpage, PageGetLSN(origpage));
isleaf = P_ISLEAF(oopaque);
+ /*
+ * Determine page offset number of existing overlapped-with-orignewitem
+ * posting list when it is necessary to perform a posting list split in
+ * passing. Note that newitem was already changed by caller (newitem no
+ * longer has the orignewitem TID).
+ *
+ * This page offset number (origpagepostingoff) will be used to pretend
+ * that the posting split has already taken place, even though the
+ * required modifications to origpage won't occur until we reach the
+ * critical section. The lastleft and firstright tuples of our page split
+ * point should, in effect, come from an imaginary version of origpage
+ * that has the nposting tuple instead of the original posting list tuple.
+ *
+ * Note: _bt_findsplitloc() should have compensated for coinciding posting
+ * list splits in just the same way, at least in theory. It doesn't
+ * bother with that, though. In practice it won't affect its choice of
+ * split point.
+ */
+ origpagepostingoff = InvalidOffsetNumber;
+ if (postingoff != 0)
+ {
+ Assert(isleaf);
+ Assert(ItemPointerCompare(&orignewitem->t_tid,
+ &newitem->t_tid) < 0);
+ Assert(BTreeTupleIsPosting(nposting));
+ origpagepostingoff = OffsetNumberPrev(newitemoff);
+ }
+
/*
* The "high key" for the new left page will be the first key that's going
* to go into the new right page, or a truncated version if this is a leaf
@@ -1335,6 +1580,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemid = PageGetItemId(origpage, firstright);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstright == origpagepostingoff)
+ item = nposting;
}
/*
@@ -1368,6 +1615,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ if (lastleftoff == origpagepostingoff)
+ lastleft = nposting;
}
Assert(lastleft != item);
@@ -1383,6 +1632,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
*/
leftoff = P_HIKEY;
+ Assert(BTreeTupleIsPivot(lefthikey) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
@@ -1447,6 +1697,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleIsPivot(item) || !itup_key->heapkeyspace);
Assert(BTreeTupleGetNAtts(item, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
@@ -1475,8 +1726,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ /* replace original item with nposting due to posting split? */
+ if (i == origpagepostingoff)
+ {
+ Assert(BTreeTupleIsPosting(item));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
+ item = nposting;
+ }
+
/* does new item belong before this one? */
- if (i == newitemoff)
+ else if (i == newitemoff)
{
if (newitemonleft)
{
@@ -1645,8 +1904,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
XLogRecPtr recptr;
xlrec.level = ropaque->btpo.level;
+ /* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstright = firstright;
xlrec.newitemoff = newitemoff;
+ xlrec.postingoff = 0;
+ if (postingoff != 0 && origpagepostingoff < firstright)
+ xlrec.postingoff = postingoff;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
@@ -1665,11 +1928,35 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* because it's included with all the other items on the right page.)
* Show the new item as belonging to the left page buffer, so that it
* is not stored if XLogInsert decides it needs a full-page image of
- * the left page. We store the offset anyway, though, to support
- * archive compression of these records.
+ * the left page. We always store newitemoff in the record, though.
+ *
+ * The details are sometimes slightly different for page splits that
+ * coincide with a posting list split. If both the replacement
+ * posting list and newitem go on the right page, then we don't need
+ * to log anything extra, just like the simple !newitemonleft
+ * no-posting-split case (postingoff is set to zero in the WAL record,
+ * so recovery doesn't need to process a posting list split at all).
+ * Otherwise, we set postingoff and log orignewitem instead of
+ * newitem, despite having actually inserted newitem. REDO routine
+ * must reconstruct nposting and newitem using _bt_swap_posting().
+ *
+ * Note: It's possible that our page split point is the point that
+ * makes the posting list lastleft and newitem firstright. This is
+ * the only case where we log orignewitem/newitem despite newitem
+ * going on the right page. If XLogInsert decides that it can omit
+ * orignewitem due to logging a full-page image of the left page,
+ * everything still works out, since recovery only needs
+ * orignewitem to restore items on the left page (just like the regular
+ * newitem-logged case).
*/
- if (newitemonleft)
+ if (newitemonleft && xlrec.postingoff == 0)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
+ else if (xlrec.postingoff != 0)
+ {
+ Assert(newitemonleft || firstright == newitemoff);
+ Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz));
+ }
/* Log the left page's new high key */
itemid = PageGetItemId(origpage, P_HIKEY);
@@ -1829,7 +2116,7 @@ _bt_insert_parent(Relation rel,
/* Recursively insert into the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
- new_item, stack->bts_offset + 1,
+ new_item, stack->bts_offset + 1, 0,
is_only);
/* be tidy */
@@ -2185,6 +2472,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@@ -2265,7 +2553,7 @@ _bt_pgaddtup(Page page,
static void
_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
{
- OffsetNumber deletable[MaxOffsetNumber];
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
OffsetNumber offnum,
minoff,
@@ -2298,6 +2586,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
* Note: if we didn't find any LP_DEAD items, then the page's
* BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
* separate write to clear it, however. We will clear it when we split
- * the page.
+ * the page, or when deduplication runs.
*/
}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index f05cbe7467..39b8f17f4b 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -24,6 +24,7 @@
#include "access/nbtree.h"
#include "access/nbtxlog.h"
+#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
@@ -37,6 +38,8 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty);
+static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
+ OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
BTStack stack, Buffer *topparent, OffsetNumber *topoff,
BlockNumber *target, BlockNumber *rightsib);
@@ -47,7 +50,8 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
*/
void
-_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
+_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage)
{
BTMetaPageData *metad;
BTPageOpaque metaopaque;
@@ -63,6 +67,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ metad->btm_allequalimage = allequalimage;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META;
@@ -102,6 +107,9 @@ _bt_upgrademetapage(Page page)
metad->btm_version = BTREE_NOVAC_VERSION;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
+ /* Only a REINDEX can set this field */
+ Assert(!metad->btm_allequalimage);
+ metad->btm_allequalimage = false;
/* Adjust pd_lower (see _bt_initmetapage() for details) */
((PageHeader) page)->pd_lower =
@@ -213,6 +221,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
md.fastlevel = metad->btm_fastlevel;
md.oldest_btpo_xact = oldestBtpoXact;
md.last_cleanup_num_heap_tuples = numHeapTuples;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
@@ -274,6 +283,8 @@ _bt_getroot(Relation rel, int access)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
@@ -394,6 +405,7 @@ _bt_getroot(Relation rel, int access)
md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_heap_tuples = -1.0;
+ md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@@ -618,22 +630,34 @@ _bt_getrootheight(Relation rel)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
return metad->btm_fastlevel;
}
/*
- * _bt_heapkeyspace() -- is heap TID being treated as a key?
+ * _bt_metaversion() -- Get version/status info from metapage.
+ *
+ * Sets caller's *heapkeyspace and *allequalimage arguments using data
+ * from the B-Tree metapage (could be locally-cached version). This
+ * information needs to be stashed in insertion scankey, so we provide a
+ * single function that fetches both at once.
*
* This is used to determine the rules that must be used to descend a
* btree. Version 4 indexes treat heap TID as a tiebreaker attribute.
* pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
* performance when inserting a new BTScanInsert-wise duplicate tuple
* among many leaf pages already full of such duplicates.
+ *
+ * Also sets allequalimage field, which indicates whether or not it is
+ * safe to apply deduplication. We rely on the assumption that
+ * btm_allequalimage will be zeroed on heapkeyspace indexes that were
+ * pg_upgrade'd from Postgres 12.
*/
-bool
-_bt_heapkeyspace(Relation rel)
+void
+_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
BTMetaPageData *metad;
@@ -651,10 +675,11 @@ _bt_heapkeyspace(Relation rel)
*/
if (metad->btm_root == P_NONE)
{
- uint32 btm_version = metad->btm_version;
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
_bt_relbuf(rel, metabuf);
- return btm_version > BTREE_NOVAC_VERSION;
+ return;
}
/*
@@ -678,9 +703,12 @@ _bt_heapkeyspace(Relation rel)
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
+ Assert(!metad->btm_allequalimage ||
+ metad->btm_version > BTREE_NOVAC_VERSION);
Assert(metad->btm_fastroot != P_NONE);
- return metad->btm_version > BTREE_NOVAC_VERSION;
+ *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
+ *allequalimage = metad->btm_allequalimage;
}
/*
@@ -964,28 +992,106 @@ _bt_page_recyclable(Page page)
* Delete item(s) from a btree leaf page during VACUUM.
*
* This routine assumes that the caller has a super-exclusive write lock on
- * the buffer. Also, the given deletable array *must* be sorted in ascending
- * order.
+ * the buffer. Also, the given deletable and updatable arrays *must* be
+ * sorted in ascending order.
+ *
+ * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
+ * in an existing posting list item are to be removed by VACUUM. This works
+ * by updating/overwriting an existing item with caller's new version of the
+ * item (a version that lacks the TIDs that are to be deleted).
*
* We record VACUUMs and b-tree deletes differently in WAL. Deletes must
* generate their own latestRemovedXid by accessing the heap directly, whereas
- * VACUUMs rely on the initial heap scan taking care of it indirectly.
+ * VACUUMs rely on the initial heap scan taking care of it indirectly. Also,
+ * only VACUUM can perform granular deletes of individual TIDs in posting list
+ * tuples.
*/
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable)
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
+ Size itemsz;
+ char *updatedbuf = NULL;
+ Size updatedbuflen = 0;
+ OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
/* Shouldn't be called unless there's something to do */
- Assert(ndeletable > 0);
+ Assert(ndeletable > 0 || nupdatable > 0);
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ /* Replace work area IndexTuple with updated version */
+ _bt_update_posting(updatable[i]);
+
+ /* Maintain array of updatable page offsets for WAL record */
+ updatedoffsets[i] = updatable[i]->updatedoffset;
+ }
+
+ /* XLOG stuff -- allocate and fill buffer before critical section */
+ if (nupdatable > 0 && RelationNeedsWAL(rel))
+ {
+ Size offset = 0;
+
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+
+ itemsz = SizeOfBtreeUpdate +
+ vacposting->ndeletedtids * sizeof(uint16);
+ updatedbuflen += itemsz;
+ }
+
+ updatedbuf = palloc(updatedbuflen);
+ for (int i = 0; i < nupdatable; i++)
+ {
+ BTVacuumPosting vacposting = updatable[i];
+ xl_btree_update update;
+
+ update.ndeletedtids = vacposting->ndeletedtids;
+ memcpy(updatedbuf + offset, &update.ndeletedtids,
+ SizeOfBtreeUpdate);
+ offset += SizeOfBtreeUpdate;
+
+ itemsz = update.ndeletedtids * sizeof(uint16);
+ memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
+ offset += itemsz;
+ }
+ }
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
- /* Fix the page */
- PageIndexMultiDelete(page, deletable, ndeletable);
+ /*
+ * Handle posting tuple updates.
+ *
+ * Deliberately do this before handling simple deletes. If we did it the
+ * other way around (i.e. WAL record order -- simple deletes before
+ * updates) then we'd have to make compensating changes to the 'updatable'
+ * array of offset numbers.
+ *
+ * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
+ * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
+ * level flag, unsetting individual LP_DEAD bits should still be avoided.
+ */
+ for (int i = 0; i < nupdatable; i++)
+ {
+ OffsetNumber updatedoffset = updatedoffsets[i];
+ IndexTuple itup;
+
+ itup = updatable[i]->itup;
+ itemsz = MAXALIGN(IndexTupleSize(itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
+ itemsz))
+ elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
+ BufferGetBlockNumber(buf), RelationGetRelationName(rel));
+ }
+
+ /* Now handle simple deletes of entire tuples */
+ if (ndeletable > 0)
+ PageIndexMultiDelete(page, deletable, ndeletable);
/*
* We can clear the vacuum cycle ID since this page has certainly been
@@ -1006,7 +1112,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
* limited, since we never falsely unset an LP_DEAD bit. Workloads that
* are particularly dependent on LP_DEAD bits being set quickly will
* usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
- * again anyway.
+ * again anyway. Furthermore, attempting a deduplication pass will remove
+ * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
+ * is set or not.
*/
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
@@ -1019,18 +1127,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.ndeleted = ndeletable;
+ xlrec_vacuum.nupdated = nupdatable;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
- /*
- * The deletable array is not in the buffer, but pretend that it is.
- * When XLogInsert stores the whole buffer, the array need not be
- * stored too.
- */
- XLogRegisterBufData(0, (char *) deletable,
- ndeletable * sizeof(OffsetNumber));
+ if (ndeletable > 0)
+ XLogRegisterBufData(0, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ if (nupdatable > 0)
+ {
+ XLogRegisterBufData(0, (char *) updatedoffsets,
+ nupdatable * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, updatedbuf, updatedbuflen);
+ }
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
@@ -1038,6 +1150,13 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
}
END_CRIT_SECTION();
+
+ /* can't leak memory here */
+ if (updatedbuf != NULL)
+ pfree(updatedbuf);
+ /* free tuples generated by calling _bt_update_posting() */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]->itup);
}
/*
@@ -1050,6 +1169,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
* This is nearly the same as _bt_delitems_vacuum as far as what it does to
* the page, but it needs to generate its own latestRemovedXid by accessing
* the heap. This is used by the REDO routine to generate recovery conflicts.
+ * Also, it doesn't handle posting list tuples unless the entire tuple can be
+ * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
*/
void
_bt_delitems_delete(Relation rel, Buffer buf,
@@ -1065,8 +1186,7 @@ _bt_delitems_delete(Relation rel, Buffer buf,
if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
latestRemovedXid =
- index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
- deletable, ndeletable);
+ _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
@@ -1113,6 +1233,83 @@ _bt_delitems_delete(Relation rel, Buffer buf,
END_CRIT_SECTION();
}
+/*
+ * Get the latestRemovedXid from the table entries pointed to by the non-pivot
+ * tuples being deleted.
+ *
+ * This is a specialized version of index_compute_xid_horizon_for_tuples().
+ * It's needed because btree tuples don't always store table TID using the
+ * standard index tuple header field.
+ */
+static TransactionId
+_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
+ OffsetNumber *deletable, int ndeletable)
+{
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ int spacenhtids;
+ int nhtids;
+ ItemPointer htids;
+
+ /* Array will grow iff there are posting list tuples to consider */
+ spacenhtids = ndeletable;
+ nhtids = 0;
+ htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
+ for (int i = 0; i < ndeletable; i++)
+ {
+ ItemId itemid;
+ IndexTuple itup;
+
+ itemid = PageGetItemId(page, deletable[i]);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+
+ Assert(ItemIdIsDead(itemid));
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (!BTreeTupleIsPosting(itup))
+ {
+ if (nhtids + 1 > spacenhtids)
+ {
+ spacenhtids *= 2;
+ htids = (ItemPointer)
+ repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
+ }
+
+ Assert(ItemPointerIsValid(&itup->t_tid));
+ ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
+ nhtids++;
+ }
+ else
+ {
+ int nposting = BTreeTupleGetNPosting(itup);
+
+ if (nhtids + nposting > spacenhtids)
+ {
+ spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
+ htids = (ItemPointer)
+ repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
+ }
+
+ for (int j = 0; j < nposting; j++)
+ {
+ ItemPointer htid = BTreeTupleGetPostingN(itup, j);
+
+ Assert(ItemPointerIsValid(htid));
+ ItemPointerCopy(htid, &htids[nhtids]);
+ nhtids++;
+ }
+ }
+ }
+
+ Assert(nhtids >= ndeletable);
+
+ latestRemovedXid =
+ table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);
+
+ pfree(htids);
+
+ return latestRemovedXid;
+}
+
/*
* Returns true, if the given block has the half-dead flag set.
*/
@@ -2058,6 +2255,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
+ xlmeta.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 5254bc7ef5..4bb16297c3 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -95,6 +95,10 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
+static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
+ IndexTuple posting,
+ OffsetNumber updatedoffset,
+ int *nremaining);
/*
@@ -161,7 +165,7 @@ btbuildempty(Relation index)
/* Construct metapage. */
metapage = (Page) palloc(BLCKSZ);
- _bt_initmetapage(metapage, P_NONE, 0);
+ _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
/*
* Write the page and log it. It might seem that an immediate sync would
@@ -264,8 +268,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
*/
if (so->killedItems == NULL)
so->killedItems = (int *)
- palloc(MaxIndexTuplesPerPage * sizeof(int));
- if (so->numKilled < MaxIndexTuplesPerPage)
+ palloc(MaxTIDsPerBTreePage * sizeof(int));
+ if (so->numKilled < MaxTIDsPerBTreePage)
so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}
@@ -1154,11 +1158,15 @@ restart:
}
else if (P_ISLEAF(opaque))
{
- OffsetNumber deletable[MaxOffsetNumber];
+ OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable;
+ BTVacuumPosting updatable[MaxIndexTuplesPerPage];
+ int nupdatable;
OffsetNumber offnum,
minoff,
maxoff;
+ int nhtidsdead,
+ nhtidslive;
/*
* Trade in the initial read lock for a super-exclusive write lock on
@@ -1190,8 +1198,11 @@ restart:
* point using callback.
*/
ndeletable = 0;
+ nupdatable = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
+ nhtidsdead = 0;
+ nhtidslive = 0;
if (callback)
{
for (offnum = minoff;
@@ -1199,11 +1210,9 @@ restart:
offnum = OffsetNumberNext(offnum))
{
IndexTuple itup;
- ItemPointer htup;
itup = (IndexTuple) PageGetItem(page,
PageGetItemId(page, offnum));
- htup = &(itup->t_tid);
/*
* Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
@@ -1226,22 +1235,82 @@ restart:
* simple, and allows us to always avoid generating our own
* conflicts.
*/
- if (callback(htup, callback_state))
- deletable[ndeletable++] = offnum;
+ Assert(!BTreeTupleIsPivot(itup));
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Regular tuple, standard table TID representation */
+ if (callback(&itup->t_tid, callback_state))
+ {
+ deletable[ndeletable++] = offnum;
+ nhtidsdead++;
+ }
+ else
+ nhtidslive++;
+ }
+ else
+ {
+ BTVacuumPosting vacposting;
+ int nremaining;
+
+ /* Posting list tuple */
+ vacposting = btreevacuumposting(vstate, itup, offnum,
+ &nremaining);
+ if (vacposting == NULL)
+ {
+ /*
+ * All table TIDs from the posting tuple remain, so no
+ * delete or update required
+ */
+ Assert(nremaining == BTreeTupleGetNPosting(itup));
+ }
+ else if (nremaining > 0)
+ {
+
+ /*
+ * Store metadata about posting list tuple in
+ * updatable array for entire page. Existing tuple
+ * will be updated during the later call to
+ * _bt_delitems_vacuum().
+ */
+ Assert(nremaining < BTreeTupleGetNPosting(itup));
+ updatable[nupdatable++] = vacposting;
+ nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
+ }
+ else
+ {
+ /*
+ * All table TIDs from the posting list must be
+ * deleted. We'll delete the index tuple completely
+ * (no update required).
+ */
+ Assert(nremaining == 0);
+ deletable[ndeletable++] = offnum;
+ nhtidsdead += BTreeTupleGetNPosting(itup);
+ pfree(vacposting);
+ }
+
+ nhtidslive += nremaining;
+ }
}
}
/*
- * Apply any needed deletes. We issue just one _bt_delitems_vacuum()
- * call per page, so as to minimize WAL traffic.
+ * Apply any needed deletes or updates. We issue just one
+ * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
*/
- if (ndeletable > 0)
+ if (ndeletable > 0 || nupdatable > 0)
{
- _bt_delitems_vacuum(rel, buf, deletable, ndeletable);
+ Assert(nhtidsdead >= Max(ndeletable, 1));
+ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
+ nupdatable);
- stats->tuples_removed += ndeletable;
+ stats->tuples_removed += nhtidsdead;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
+
+ /* can't leak memory here */
+ for (int i = 0; i < nupdatable; i++)
+ pfree(updatable[i]);
}
else
{
@@ -1254,6 +1323,7 @@ restart:
* We treat this like a hint-bit update because there's no need to
* WAL-log it.
*/
+ Assert(nhtidsdead == 0);
if (vstate->cycleid != 0 &&
opaque->btpo_cycleid == vstate->cycleid)
{
@@ -1263,15 +1333,18 @@ restart:
}
/*
- * If it's now empty, try to delete; else count the live tuples. We
- * don't delete when recursing, though, to avoid putting entries into
- * freePages out-of-order (doesn't seem worth any extra code to handle
- * the case).
+ * If it's now empty, try to delete; else count the live tuples (live
+ * table TIDs in posting lists are counted as separate live tuples).
+ * We don't delete when recursing, though, to avoid putting entries
+ * into freePages out-of-order (doesn't seem worth any extra code to
+ * handle the case).
*/
if (minoff > maxoff)
delete_now = (blkno == orig_blkno);
else
- stats->num_index_tuples += maxoff - minoff + 1;
+ stats->num_index_tuples += nhtidslive;
+
+ Assert(!delete_now || nhtidslive == 0);
}
if (delete_now)
@@ -1303,9 +1376,10 @@ restart:
/*
* This is really tail recursion, but if the compiler is too stupid to
* optimize it as such, we'd eat an uncomfortably large amount of stack
- * space per recursion level (due to the deletable[] array). A failure is
- * improbable since the number of levels isn't likely to be large ... but
- * just in case, let's hand-optimize into a loop.
+ * space per recursion level (due to the arrays used to track details of
+ * deletable/updatable items). A failure is improbable since the number
+ * of levels isn't likely to be large ... but just in case, let's
+ * hand-optimize into a loop.
*/
if (recurse_to != P_NONE)
{
@@ -1314,6 +1388,61 @@ restart:
}
}
+/*
+ * btreevacuumposting --- determine TIDs still needed in posting list
+ *
+ * Returns metadata describing how to build replacement tuple without the TIDs
+ * that VACUUM needs to delete. Returned value is NULL in the common case
+ * where no changes are needed to caller's posting list tuple (we avoid
+ * allocating memory here as an optimization).
+ *
+ * The number of TIDs that should remain in the posting list tuple is set for
+ * caller in *nremaining.
+ */
+static BTVacuumPosting
+btreevacuumposting(BTVacState *vstate, IndexTuple posting,
+ OffsetNumber updatedoffset, int *nremaining)
+{
+ int live = 0;
+ int nitem = BTreeTupleGetNPosting(posting);
+ ItemPointer items = BTreeTupleGetPosting(posting);
+ BTVacuumPosting vacposting = NULL;
+
+ for (int i = 0; i < nitem; i++)
+ {
+ if (!vstate->callback(items + i, vstate->callback_state))
+ {
+ /* Live table TID */
+ live++;
+ }
+ else if (vacposting == NULL)
+ {
+ /*
+ * First dead table TID encountered.
+ *
+ * It's now clear that we need to delete one or more dead table
+ * TIDs, so start maintaining metadata describing how to update
+ * existing posting list tuple.
+ */
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ nitem * sizeof(uint16));
+
+ vacposting->itup = posting;
+ vacposting->updatedoffset = updatedoffset;
+ vacposting->ndeletedtids = 0;
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ else
+ {
+ /* Second or subsequent dead table TID */
+ vacposting->deletetids[vacposting->ndeletedtids++] = i;
+ }
+ }
+
+ *nremaining = live;
+ return vacposting;
+}
+
/*
* btcanreturn() -- Check whether btree indexes support index-only scans.
*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index df065d72f8..7aaa8c17b0 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -26,10 +26,18 @@
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
+static int _bt_binsrch_posting(BTScanInsert key, Page page,
+ OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
+static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer heapTid,
+ IndexTuple itup);
+static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
@@ -142,6 +150,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
offnum = _bt_binsrch(rel, key, *bufP);
itemid = PageGetItemId(page, offnum);
itup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
blkno = BTreeTupleGetDownLink(itup);
par_blkno = BufferGetBlockNumber(*bufP);
@@ -434,7 +443,10 @@ _bt_binsrch(Relation rel,
* low) makes bounds invalid.
*
* Caller is responsible for invalidating bounds when it modifies the page
- * before calling here a second time.
+ * before calling here a second time, and for dealing with posting list
+ * tuple matches (callers can use insertstate's postingoff field to
+ * determine which existing heap TID will need to be replaced by a posting
+ * list split).
*/
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
@@ -453,6 +465,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Assert(P_ISLEAF(opaque));
Assert(!key->nextkey);
+ Assert(insertstate->postingoff == 0);
if (!insertstate->bounds_valid)
{
@@ -509,6 +522,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
if (result != 0)
stricthigh = high;
}
+
+ /*
+ * If tuple at offset located by binary search is a posting list whose
+ * TID range overlaps with caller's scantid, perform posting list
+ * binary search to set postingoff for caller. Caller must split the
+ * posting list when postingoff is set. This should happen
+ * infrequently.
+ */
+ if (unlikely(result == 0 && key->scantid != NULL))
+ insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
}
/*
@@ -528,6 +551,73 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
return low;
}
+/*----------
+ * _bt_binsrch_posting() -- posting list binary search.
+ *
+ * Helper routine for _bt_binsrch_insert().
+ *
+ * Returns offset into posting list where caller's scantid belongs.
+ *----------
+ */
+static int
+_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
+{
+ IndexTuple itup;
+ ItemId itemid;
+ int low,
+ high,
+ mid,
+ res;
+
+ /*
+ * If this isn't a posting tuple, then the index must be corrupt (if it is
+ * an ordinary non-pivot tuple then there must be an existing tuple with a
+ * heap TID that equals inserter's new heap TID/scantid). Defensively
+ * check that tuple is a posting list tuple whose posting list range
+ * includes caller's scantid.
+ *
+ * (This is also needed because contrib/amcheck's rootdescend option needs
+ * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
+ */
+ itemid = PageGetItemId(page, offnum);
+ itup = (IndexTuple) PageGetItem(page, itemid);
+ if (!BTreeTupleIsPosting(itup))
+ return 0;
+
+ Assert(key->heapkeyspace && key->allequalimage);
+
+ /*
+ * In the event that posting list tuple has LP_DEAD bit set, indicate this
+ * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
+ * second call to _bt_binsrch_insert() can take place when its caller has
+ * removed the dead item.
+ */
+ if (ItemIdIsDead(itemid))
+ return -1;
+
+ /* "high" is past end of posting list for loop invariant */
+ low = 0;
+ high = BTreeTupleGetNPosting(itup);
+ Assert(high >= 2);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res > 0)
+ low = mid + 1;
+ else if (res < 0)
+ high = mid;
+ else
+ return mid;
+ }
+
+ /* Exact match not found */
+ return low;
+}
+
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
@@ -537,9 +627,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
- * NULLs in the keys are treated as sortable values. Therefore
- * "equality" does not necessarily mean that the item should be
- * returned to the caller as a matching key!
+ *
+ * NULLs in the keys are treated as sortable values. Therefore
+ * "equality" does not necessarily mean that the item should be returned
+ * to the caller as a matching key. Similarly, an insertion scankey
+ * with its scantid set is treated as equal to a posting tuple whose TID
+ * range overlaps with its scantid. There generally won't be a
+ * matching TID in the posting tuple, which caller must handle
+ * themselves (e.g., by splitting the posting list tuple).
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
@@ -563,6 +658,7 @@ _bt_compare(Relation rel,
ScanKey scankey;
int ncmpkey;
int ntupatts;
+ int32 result;
Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
@@ -592,12 +688,12 @@ _bt_compare(Relation rel,
ncmpkey = Min(ntupatts, key->keysz);
Assert(key->heapkeyspace || ncmpkey == key->keysz);
+ Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
scankey = key->scankeys;
for (int i = 1; i <= ncmpkey; i++)
{
Datum datum;
bool isNull;
- int32 result;
datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
@@ -712,8 +808,25 @@ _bt_compare(Relation rel,
if (heapTid == NULL)
return 1;
+ /*
+ * Scankey must be treated as equal to a posting list tuple if its scantid
+ * value falls within the range of the posting list. In all other cases
+ * there can only be a single heap TID value, which is compared directly
+ * with scantid.
+ */
Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
- return ItemPointerCompare(key->scantid, heapTid);
+ result = ItemPointerCompare(key->scantid, heapTid);
+ if (result <= 0 || !BTreeTupleIsPosting(itup))
+ return result;
+ else
+ {
+ result = ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxHeapTID(itup));
+ if (result > 0)
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -1228,7 +1341,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* Initialize remaining insertion scan key fields */
- inskey.heapkeyspace = _bt_heapkeyspace(rel);
+ _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
inskey.anynullkeys = false; /* unused */
inskey.nextkey = nextkey;
inskey.pivotsearch = false;
@@ -1483,9 +1596,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
- /* tuple passes all scan key conditions, so remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
- itemIndex++;
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID
+ */
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ itemIndex++;
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ itemIndex++;
+ }
+ }
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
@@ -1518,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (!continuescan)
so->currPos.moreRight = false;
- Assert(itemIndex <= MaxIndexTuplesPerPage);
+ Assert(itemIndex <= MaxTIDsPerBTreePage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
@@ -1526,7 +1665,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
else
{
/* load items[] in descending order */
- itemIndex = MaxIndexTuplesPerPage;
+ itemIndex = MaxTIDsPerBTreePage;
offnum = Min(offnum, maxoff);
@@ -1567,9 +1706,41 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
&continuescan);
if (passes_quals && tuple_alive)
{
- /* tuple passes all scan key conditions, so remember it */
- itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ /* tuple passes all scan key conditions */
+ if (!BTreeTupleIsPosting(itup))
+ {
+ /* Remember it */
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ int tupleOffset;
+
+ /*
+ * Set up state to return posting list, and remember first
+ * TID.
+ *
+ * Note that we deliberately save/return items from
+ * posting lists in ascending heap TID order for backwards
+ * scans. This allows _bt_killitems() to make a
+ * consistent assumption about the order of items
+ * associated with the same posting list tuple.
+ */
+ itemIndex--;
+ tupleOffset =
+ _bt_setuppostingitems(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, 0),
+ itup);
+ /* Remember additional TIDs */
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ tupleOffset);
+ }
+ }
}
if (!continuescan)
{
@@ -1583,8 +1754,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Assert(itemIndex >= 0);
so->currPos.firstItem = itemIndex;
- so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
- so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+ so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
+ so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
}
return (so->currPos.firstItem <= so->currPos.lastItem);
@@ -1597,6 +1768,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
@@ -1609,6 +1782,71 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
}
}
+/*
+ * Setup state to save TIDs/items from a single posting list tuple.
+ *
+ * Saves an index item into so->currPos.items[itemIndex] for TID that is
+ * returned to scan first. Second or subsequent TIDs for posting list should
+ * be saved by calling _bt_savepostingitem().
+ *
+ * Returns an offset into the tuple storage area where the main tuple was
+ * stored, if one was needed (i.e. when so->currTuples is in use).
+ */
+static int
+_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, IndexTuple itup)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ Assert(BTreeTupleIsPosting(itup));
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+ if (so->currTuples)
+ {
+ /* Save base IndexTuple (truncate posting list) */
+ IndexTuple base;
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ itupsz = MAXALIGN(itupsz);
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
+ memcpy(base, itup, itupsz);
+ /* Defensively reduce work area index tuple header size */
+ base->t_info &= ~INDEX_SIZE_MASK;
+ base->t_info |= itupsz;
+ so->currPos.nextTupleOffset += itupsz;
+
+ return currItem->tupleOffset;
+ }
+
+ return 0;
+}
+
+/*
+ * Save an index item into so->currPos.items[itemIndex] for current posting
+ * tuple.
+ *
+ * Assumes that _bt_setuppostingitems() has already been called for current
+ * posting list tuple. Caller passes its return value as tupleOffset.
+ */
+static inline void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer heapTid, int tupleOffset)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *heapTid;
+ currItem->indexOffset = offnum;
+
+ /*
+ * Have index-only scans return the same base IndexTuple for every TID
+ * that originates from the same posting list
+ */
+ if (so->currTuples)
+ currItem->tupleOffset = tupleOffset;
+}
+
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index baec5de999..e66cd36dfa 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -243,6 +243,7 @@ typedef struct BTPageState
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
+ Size btps_lastextra; /* last item's extra posting list space */
uint32 btps_level; /* tree level (0 = leaf) */
Size btps_full; /* "full" if less than this much free space */
struct BTPageState *btps_next; /* link to parent level, if any */
@@ -277,7 +278,10 @@ static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
- IndexTuple itup);
+ IndexTuple itup, Size truncextra);
+static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
+ BTPageState *state,
+ BTDedupState dstate);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
BTSpool *btspool, BTSpool *btspool2);
@@ -563,6 +567,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.heap = btspool->heap;
wstate.index = btspool->index;
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
+ /* _bt_mkscankey() won't set allequalimage without metapage */
+ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
@@ -711,6 +717,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level)
state->btps_lowkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastoff = P_HIKEY;
+ state->btps_lastextra = 0;
state->btps_level = level;
/* set "full" threshold based on level. See notes at head of file. */
if (level > 0)
@@ -789,7 +796,8 @@ _bt_sortaddtup(Page page,
}
/*----------
- * Add an item to a disk page from the sort output.
+ * Add an item to a disk page from the sort output (or add a posting list
+ * item formed from the sort output).
*
* We must be careful to observe the page layout conventions of nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
@@ -821,14 +829,27 @@ _bt_sortaddtup(Page page,
* the truncated high key at offset 1.
*
* 'last' pointer indicates the last offset added to the page.
+ *
+ * 'truncextra' is the size of the posting list in itup, if any. This
+ * information is stashed for the next call here, when we may benefit
+ * from considering the impact of truncating away the posting list on
+ * the page before deciding to finish the page off. Posting lists are
+ * often relatively large, so it is worth going to the trouble of
+ * accounting for the saving from truncating away the posting list of
+ * the tuple that becomes the high key (that may be the only way to
+ * get close to target free space on the page). Note that this is
+ * only used for the soft fillfactor-wise limit, not the critical hard
+ * limit.
*----------
*/
static void
-_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
+_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
+ Size truncextra)
{
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
+ Size last_truncextra;
Size pgspc;
Size itupsz;
bool isleaf;
@@ -842,6 +863,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
npage = state->btps_page;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
+ last_truncextra = state->btps_lastextra;
+ state->btps_lastextra = truncextra;
pgspc = PageGetFreeSpace(npage);
itupsz = IndexTupleSize(itup);
@@ -883,10 +906,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* page. Disregard fillfactor and insert on "full" current page if we
* don't have the minimum number of items yet. (Note that we deliberately
* assume that suffix truncation neither enlarges nor shrinks new high key
- * when applying soft limit.)
+ * when applying soft limit, except when last tuple has a posting list.)
*/
if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
- (pgspc < state->btps_full && last_off > P_FIRSTKEY))
+ (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
{
/*
* Finish off the page and write it out.
@@ -944,11 +967,14 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* We don't try to bias our choice of split point to make it more
* likely that _bt_truncate() can truncate away more attributes,
* whereas the split point used within _bt_split() is chosen much
- * more delicately. Suffix truncation is mostly useful because it
- * improves space utilization for workloads with random
- * insertions. It doesn't seem worthwhile to add logic for
- * choosing a split point here for a benefit that is bound to be
- * much smaller.
+ * more delicately. Even still, the lastleft and firstright
+ * tuples passed to _bt_truncate() here are at least not fully
+ * equal to each other when deduplication is used, unless there is
+ * a large group of duplicates (also, unique index builds usually
+ * have few or no spool2 duplicates). When the split point is
+ * between two unequal tuples, _bt_truncate() will avoid including
+ * a heap TID in the new high key, which is the most important
+ * benefit of suffix truncation.
*
* Overwrite the old item with new truncated high key directly.
* oitup is already located at the physical beginning of tuple
@@ -983,7 +1009,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
!P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
- _bt_buildadd(wstate, state->btps_next, state->btps_lowkey);
+ _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
pfree(state->btps_lowkey);
/*
@@ -1045,6 +1071,43 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
state->btps_lastoff = last_off;
}
+/*
+ * Finalize pending posting list tuple, and add it to the index. Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
+ * using _bt_buildadd().
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+ BTDedupState dstate)
+{
+ Assert(dstate->nitems > 0);
+
+ if (dstate->nitems == 1)
+ _bt_buildadd(wstate, state, dstate->base, 0);
+ else
+ {
+ IndexTuple postingtuple;
+ Size truncextra;
+
+ /* form a tuple with a posting list */
+ postingtuple = _bt_form_posting(dstate->base,
+ dstate->htids,
+ dstate->nhtids);
+ /* Calculate posting list overhead */
+ truncextra = IndexTupleSize(postingtuple) -
+ BTreeTupleGetPostingOffset(postingtuple);
+
+ _bt_buildadd(wstate, state, postingtuple, truncextra);
+ pfree(postingtuple);
+ }
+
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0;
+}
+
/*
* Finish writing out the completed btree.
*/
@@ -1090,7 +1153,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
!P_LEFTMOST(opaque));
BTreeTupleSetDownLink(s->btps_lowkey, blkno);
- _bt_buildadd(wstate, s->btps_next, s->btps_lowkey);
+ _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
pfree(s->btps_lowkey);
s->btps_lowkey = NULL;
}
@@ -1111,7 +1174,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* by filling in a valid magic number in the metapage.
*/
metapage = (Page) palloc(BLCKSZ);
- _bt_initmetapage(metapage, rootblkno, rootlevel);
+ _bt_initmetapage(metapage, rootblkno, rootlevel,
+ wstate->inskey->allequalimage);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
@@ -1132,6 +1196,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
SortSupport sortKeys;
int64 tuples_done = 0;
+ bool deduplicate;
+
+ deduplicate = wstate->inskey->allequalimage &&
+ BTGetDeduplicateItems(wstate->index);
if (merge)
{
@@ -1228,12 +1296,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (load1)
{
- _bt_buildadd(wstate, state, itup);
+ _bt_buildadd(wstate, state, itup, 0);
itup = tuplesort_getindextuple(btspool->sortstate, true);
}
else
{
- _bt_buildadd(wstate, state, itup2);
+ _bt_buildadd(wstate, state, itup2, 0);
itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
}
@@ -1243,9 +1311,100 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
}
pfree(sortKeys);
}
+ else if (deduplicate)
+ {
+ /* merge is unnecessary, deduplicate into posting lists */
+ BTDedupState dstate;
+
+ dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ dstate->deduplicate = true; /* unused */
+ dstate->maxpostingsize = 0; /* set later */
+ /* Metadata about base tuple of current pending posting list */
+ dstate->base = NULL;
+ dstate->baseoff = InvalidOffsetNumber; /* unused */
+ dstate->basetupsize = 0;
+ /* Metadata about current pending posting list TIDs */
+ dstate->htids = NULL;
+ dstate->nhtids = 0;
+ dstate->nitems = 0;
+ dstate->phystupsize = 0; /* unused */
+ dstate->nintervals = 0; /* unused */
+
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ {
+ state = _bt_pagestate(wstate, 0);
+
+ /*
+ * Limit size of posting list tuples to 1/10 space we want to
+ * leave behind on the page, plus space for final item's line
+ * pointer. This is equal to the space that we'd like to
+ * leave behind on each leaf page when fillfactor is 90,
+ * allowing us to get close to fillfactor% space utilization
+ * when there happen to be a great many duplicates. (This
+ * makes higher leaf fillfactor settings ineffective when
+ * building indexes that have many duplicates, but packing
+ * leaf pages full with few very large tuples doesn't seem
+ * like a useful goal.)
+ */
+ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
+ sizeof(ItemIdData);
+ Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
+ dstate->maxpostingsize <= INDEX_SIZE_MASK);
+ dstate->htids = palloc(dstate->maxpostingsize);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+ else if (_bt_keep_natts_fast(wstate->index, dstate->base,
+ itup) > keysz &&
+ _bt_dedup_save_htid(dstate, itup))
+ {
+ /*
+ * Tuple is equal to base tuple of pending posting list. Heap
+ * TID from itup has been saved in state.
+ */
+ }
+ else
+ {
+ /*
+ * Tuple is not equal to pending posting list tuple, or
+ * _bt_dedup_save_htid() opted to not merge current item into
+ * pending posting list.
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+
+ /* start new pending posting list with itup copy */
+ _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
+ InvalidOffsetNumber);
+ }
+
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+
+ if (state)
+ {
+ /*
+ * Handle the last item (there must be a last item when the
+ * tuplesort returned one or more tuples)
+ */
+ _bt_sort_dedup_finish_pending(wstate, state, dstate);
+ pfree(dstate->base);
+ pfree(dstate->htids);
+ }
+
+ pfree(dstate);
+ }
else
{
- /* merge is unnecessary */
+ /* merging and deduplication are both unnecessary */
while ((itup = tuplesort_getindextuple(btspool->sortstate,
true)) != NULL)
{
@@ -1253,7 +1412,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
if (state == NULL)
state = _bt_pagestate(wstate, 0);
- _bt_buildadd(wstate, state, itup);
+ _bt_buildadd(wstate, state, itup, 0);
/* Report progress */
pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index 76c2d945c8..8ba055be9e 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel,
state.minfirstrightsz = SIZE_MAX;
state.newitemoff = newitemoff;
+ /* newitem cannot be a posting list item */
+ Assert(!BTreeTupleIsPosting(newitem));
+
/*
* maxsplits should never exceed maxoff because there will be at most as
* many candidate split points as there are points _between_ tuples, once
@@ -459,6 +462,7 @@ _bt_recsplitloc(FindSplitData *state,
int16 leftfree,
rightfree;
Size firstrightitemsz;
+ Size postingsz = 0;
bool newitemisfirstonright;
/* Is the new item going to be the first item on the right page? */
@@ -468,8 +472,30 @@ _bt_recsplitloc(FindSplitData *state,
if (newitemisfirstonright)
firstrightitemsz = state->newitemsz;
else
+ {
firstrightitemsz = firstoldonrightsz;
+ /*
+ * Calculate suffix truncation space saving when firstright is a
+ * posting list tuple, though only when the firstright is over 64
+ * bytes including line pointer overhead (arbitrary). This avoids
+ * accessing the tuple in cases where its posting list must be very
+ * small (if firstright has one at all).
+ */
+ if (state->is_leaf && firstrightitemsz > 64)
+ {
+ ItemId itemid;
+ IndexTuple newhighkey;
+
+ itemid = PageGetItemId(state->page, firstoldonright);
+ newhighkey = (IndexTuple) PageGetItem(state->page, itemid);
+
+ if (BTreeTupleIsPosting(newhighkey))
+ postingsz = IndexTupleSize(newhighkey) -
+ BTreeTupleGetPostingOffset(newhighkey);
+ }
+ }
+
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
rightfree = state->rightspace -
@@ -491,11 +517,17 @@ _bt_recsplitloc(FindSplitData *state,
* If we are on the leaf level, assume that suffix truncation cannot avoid
* adding a heap TID to the left half's new high key when splitting at the
* leaf level. In practice the new high key will often be smaller and
- * will rarely be larger, but conservatively assume the worst case.
+ * will rarely be larger, but conservatively assume the worst case. We do
+ * go to the trouble of subtracting away posting list overhead, though
+ * only when it looks like it will make an appreciable difference.
+ * (Posting lists are the only case where truncation will typically make
+ * the final high key far smaller than firstright, so being a bit more
+ * precise there noticeably improves the balance of free space.)
*/
if (state->is_leaf)
leftfree -= (int16) (firstrightitemsz +
- MAXALIGN(sizeof(ItemPointerData)));
+ MAXALIGN(sizeof(ItemPointerData)) -
+ postingsz);
else
leftfree -= (int16) firstrightitemsz;
@@ -691,7 +723,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
tup = (IndexTuple) PageGetItem(state->page, itemid);
/* Do cheaper test first */
- if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+ if (BTreeTupleIsPosting(tup) ||
+ !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
return false;
/* Check same conditions as rightmost item case, too */
keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index af07732eab..54afa6f417 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -81,7 +81,10 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
* determine whether or not the keys in the index are expected to be
* unique (i.e. if this is a "heapkeyspace" index). We assume a
* heapkeyspace index when caller passes a NULL tuple, allowing index
- * build callers to avoid accessing the non-existent metapage.
+ * build callers to avoid accessing the non-existent metapage. We
+ * also assume that the index is _not_ allequalimage when a NULL tuple
+ * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
+ * field themselves.
*/
BTScanInsert
_bt_mkscankey(Relation rel, IndexTuple itup)
@@ -108,7 +111,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
*/
key = palloc(offsetof(BTScanInsertData, scankeys) +
sizeof(ScanKeyData) * indnkeyatts);
- key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
+ if (itup)
+ _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
+ else
+ {
+ /* Utility statement callers can set these fields themselves */
+ key->heapkeyspace = true;
+ key->allequalimage = false;
+ }
key->anynullkeys = false; /* initial assumption */
key->nextkey = false;
key->pivotsearch = false;
@@ -1374,6 +1384,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
continue;
}
@@ -1535,6 +1546,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
cmpresult = 0;
if (subkey->sk_flags & SK_ROW_END)
break;
@@ -1774,10 +1786,65 @@ _bt_killitems(IndexScanDesc scan)
{
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
+ bool killtuple = false;
- if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ if (BTreeTupleIsPosting(ituple))
{
- /* found the item */
+ int pi = i + 1;
+ int nposting = BTreeTupleGetNPosting(ituple);
+ int j;
+
+ /*
+ * Note that we rely on the assumption that heap TIDs in the
+ * scanpos items array are always in ascending heap TID order
+ * within a posting list
+ */
+ for (j = 0; j < nposting; j++)
+ {
+ ItemPointer item = BTreeTupleGetPostingN(ituple, j);
+
+ if (!ItemPointerEquals(item, &kitem->heapTid))
+ break; /* out of posting list loop */
+
+ /* kitem must have matching offnum when heap TIDs match */
+ Assert(kitem->indexOffset == offnum);
+
+ /*
+ * Read-ahead to later kitems here.
+ *
+ * We rely on the assumption that not advancing kitem here
+ * will prevent us from considering the posting list tuple
+ * fully dead by not matching its next heap TID in next
+ * loop iteration.
+ *
+ * If, on the other hand, this is the final heap TID in
+ * the posting list tuple, then tuple gets killed
+ * regardless (i.e. we handle the case where the last
+ * kitem is also the last heap TID in the last index tuple
+ * correctly -- posting tuple still gets killed).
+ */
+ if (pi < numKilled)
+ kitem = &so->currPos.items[so->killedItems[pi++]];
+ }
+
+ /*
+ * Don't bother advancing the outermost loop's int iterator to
+ * avoid processing killed items that relate to the same
+ * offnum/posting list tuple. This micro-optimization hardly
+ * seems worth it. (Further iterations of the outermost loop
+ * will fail to match on this same posting list's first heap
+ * TID instead, so we'll advance to the next offnum/index
+ * tuple pretty quickly.)
+ */
+ if (j == nposting)
+ killtuple = true;
+ }
+ else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ killtuple = true;
+
+ if (killtuple)
+ {
+ /* found the item/all posting list items */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
@@ -2018,7 +2085,9 @@ btoptions(Datum reloptions, bool validate)
static const relopt_parse_elt tab[] = {
{"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
- offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}
+ offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
+ {"deduplicate_items", RELOPT_TYPE_BOOL,
+ offsetof(BTOptions, deduplicate_items)}
};
@@ -2119,11 +2188,10 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
Size newsize;
/*
- * We should only ever truncate leaf index tuples. It's never okay to
- * truncate a second time.
+ * We should only ever truncate non-pivot tuples from leaf pages. It's
+ * never okay to truncate when splitting an internal page.
*/
- Assert(BTreeTupleGetNAtts(lastleft, rel) == natts);
- Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
+ Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
/* Determine how many attributes must be kept in truncated tuple */
keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
@@ -2139,6 +2207,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
+ if (BTreeTupleIsPosting(pivot))
+ {
+ /*
+ * index_truncate_tuple() just returns a straight copy of
+ * firstright when it has no key attributes to truncate. We need
+ * to truncate away the posting list ourselves.
+ */
+ Assert(keepnatts == nkeyatts);
+ Assert(natts == nkeyatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
+ }
+
/*
* If there is a distinguishing key attribute within new pivot tuple,
* there is no need to add an explicit heap TID attribute
@@ -2155,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute to the new pivot tuple.
*/
Assert(natts != nkeyatts);
+ Assert(!BTreeTupleIsPosting(lastleft) &&
+ !BTreeTupleIsPosting(firstright));
newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
tidpivot = palloc0(newsize);
memcpy(tidpivot, pivot, IndexTupleSize(pivot));
@@ -2172,6 +2255,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
pivot = palloc0(newsize);
memcpy(pivot, firstright, IndexTupleSize(firstright));
+
+ if (BTreeTupleIsPosting(firstright))
+ {
+ /*
+ * New pivot tuple was copied from firstright, which happens to be
+ * a posting list tuple. We will have to include the max lastleft
+ * heap TID in the final pivot tuple, but we can remove the
+ * posting list now. (Pivot tuples should never contain a posting
+ * list.)
+ */
+ newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) +
+ MAXALIGN(sizeof(ItemPointerData));
+ }
}
/*
@@ -2199,7 +2295,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
sizeof(ItemPointerData));
- ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
/*
* Lehman and Yao require that the downlink to the right page, which is to
@@ -2210,9 +2306,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* tiebreaker.
*/
#ifndef DEBUG_NO_TRUNCATE
- Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
- Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#else
/*
@@ -2225,7 +2324,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute values along with lastleft's heap TID value when lastleft's
* TID happens to be greater than firstright's TID.
*/
- ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
/*
* Pivot heap TID should never be fully equal to firstright. Note that
@@ -2234,7 +2333,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
ItemPointerSetOffsetNumber(pivotheaptid,
OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#endif
BTreeTupleSetNAtts(pivot, nkeyatts);
@@ -2301,6 +2401,13 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
keepnatts++;
}
+ /*
+ * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
+ * expected in an allequalimage index.
+ */
+ Assert(!itup_key->allequalimage ||
+ keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
+
return keepnatts;
}
@@ -2315,13 +2422,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* The approach taken here usually provides the same answer as _bt_keep_natts
* will (for the same pair of tuples from a heapkeyspace index), since the
* majority of btree opclasses can never indicate that two datums are equal
- * unless they're bitwise equal after detoasting.
+ * unless they're bitwise equal after detoasting. When an index only has
+ * "equal image" columns, routine is guaranteed to give the same result as
+ * _bt_keep_natts would.
*
- * These issues must be acceptable to callers, typically because they're only
- * concerned about making suffix truncation as effective as possible without
- * leaving excessive amounts of free space on either side of page split.
* Callers can rely on the fact that attributes considered equal here are
- * definitely also equal according to _bt_keep_natts.
+ * definitely also equal according to _bt_keep_natts, even when the index uses
+ * an opclass or collation that is not "allequalimage"/deduplication-safe.
+ * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
+ * negatives generally only have the effect of making leaf page splits use a
+ * more balanced split point.
*/
int
_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@@ -2393,28 +2503,42 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* Mask allocated for number of keys in index tuple must be able to fit
* maximum possible number of index attributes
*/
- StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
- "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
+ StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
+ "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
tupnatts = BTreeTupleGetNAtts(itup, rel);
+ /* !heapkeyspace indexes do not support deduplication */
+ if (!heapkeyspace && BTreeTupleIsPosting(itup))
+ return false;
+
+ /* Posting list tuples should never have "pivot heap TID" bit set */
+ if (BTreeTupleIsPosting(itup) &&
+ (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return false;
+
+ /* INCLUDE indexes do not support deduplication */
+ if (natts != nkeyatts && BTreeTupleIsPosting(itup))
+ return false;
+
if (P_ISLEAF(opaque))
{
if (offnum >= P_FIRSTDATAKEY(opaque))
{
/*
- * Non-pivot tuples currently never use alternative heap TID
- * representation -- even those within heapkeyspace indexes
+ * Non-pivot tuple should never be explicitly marked as a pivot
+ * tuple
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+ if (BTreeTupleIsPivot(itup))
return false;
/*
* Leaf tuples that are not the page high key (non-pivot tuples)
* should never be truncated. (Note that tupnatts must have been
- * inferred, rather than coming from an explicit on-disk
- * representation.)
+ * inferred, even with a posting list tuple, because only pivot
+ * tuples store tupnatts directly.)
*/
return tupnatts == natts;
}
@@ -2458,12 +2582,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* non-zero, or when there is no explicit representation and the
* tuple is evidently not a pre-pg_upgrade tuple.
*
- * Prior to v11, downlinks always had P_HIKEY as their offset. Use
- * that to decide if the tuple is a pre-v11 tuple.
+ * Prior to v11, downlinks always had P_HIKEY as their offset.
+ * Accept that as an alternative indication of a valid
+ * !heapkeyspace negative infinity tuple.
*/
return tupnatts == 0 ||
- ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
- ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+ ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
}
else
{
@@ -2489,7 +2613,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* heapkeyspace index pivot tuples, regardless of whether or not there are
* non-key attributes.
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ if (!BTreeTupleIsPivot(itup))
+ return false;
+
+ /* Pivot tuple should not use posting list representation (redundant) */
+ if (BTreeTupleIsPosting(itup))
return false;
/*
@@ -2559,8 +2687,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
BTMaxItemSizeNoHeapTid(page),
RelationGetRelationName(rel)),
errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
- ItemPointerGetBlockNumber(&newtup->t_tid),
- ItemPointerGetOffsetNumber(&newtup->t_tid),
+ ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+ ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
RelationGetRelationName(heap)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 2e5202c2d6..99d0914e72 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -22,6 +22,9 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/procarray.h"
+#include "utils/memutils.h"
+
+static MemoryContext opCtx; /* working memory for operations */
/*
* _bt_restore_page -- re-enter all the index tuples on a page
@@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
Assert(md->btm_version >= BTREE_NOVAC_VERSION);
md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
+ md->btm_allequalimage = xlrec->allequalimage;
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
pageop->btpo_flags = BTP_META;
@@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
}
static void
-btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
+btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
+ XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
@@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
page = BufferGetPage(buffer);
- if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
- false, false) == InvalidOffsetNumber)
- elog(PANIC, "btree_xlog_insert: failed to add item");
+ if (!posting)
+ {
+ /* Simple retail insertion */
+ if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add new item");
+ }
+ else
+ {
+ ItemId itemid;
+ IndexTuple oposting,
+ newitem,
+ nposting;
+ uint16 postingoff;
+
+ /*
+ * A posting list split occurred during leaf page insertion. WAL
+ * record data will start with an offset number representing the
+ * point in an existing posting list that a split occurs at.
+ *
+ * Use _bt_swap_posting() to repeat posting list split steps from
+ * primary. Note that newitem from WAL record is 'orignewitem',
+ * not the final version of newitem that is actually inserted on
+ * page.
+ */
+ postingoff = *((uint16 *) datapos);
+ datapos += sizeof(uint16);
+ datalen -= sizeof(uint16);
+
+ itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
+ oposting = (IndexTuple) PageGetItem(page, itemid);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ Assert(isleaf && postingoff > 0);
+ newitem = CopyIndexTuple((IndexTuple) datapos);
+ nposting = _bt_swap_posting(newitem, oposting, postingoff);
+
+ /* Replace existing posting list with post-split version */
+ memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+ /* Insert "final" new item (not orignewitem from WAL stream) */
+ Assert(IndexTupleSize(newitem) == datalen);
+ if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "failed to add posting split new item");
+ }
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
@@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
OffsetNumber off;
IndexTuple newitem = NULL,
- left_hikey = NULL;
+ left_hikey = NULL,
+ nposting = NULL;
Size newitemsz = 0,
left_hikeysz = 0;
Page newlpage;
- OffsetNumber leftoff;
+ OffsetNumber leftoff,
+ replacepostingoff = InvalidOffsetNumber;
datapos = XLogRecGetBlockData(record, 0, &datalen);
- if (onleft)
+ if (onleft || xlrec->postingoff != 0)
{
newitem = (IndexTuple) datapos;
newitemsz = MAXALIGN(IndexTupleSize(newitem));
datapos += newitemsz;
datalen -= newitemsz;
+
+ if (xlrec->postingoff != 0)
+ {
+ ItemId itemid;
+ IndexTuple oposting;
+
+ /* Posting list must be at offset number before new item's */
+ replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
+
+ /* Use mutable, aligned newitem copy in _bt_swap_posting() */
+ newitem = CopyIndexTuple(newitem);
+ itemid = PageGetItemId(lpage, replacepostingoff);
+ oposting = (IndexTuple) PageGetItem(lpage, itemid);
+ nposting = _bt_swap_posting(newitem, oposting,
+ xlrec->postingoff);
+ }
}
/*
@@ -308,8 +374,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
Size itemsz;
IndexTuple item;
+ /* Add replacement posting list when required */
+ if (off == replacepostingoff)
+ {
+ Assert(onleft || xlrec->firstright == xlrec->newitemoff);
+ if (PageAddItem(newlpage, (Item) nposting,
+ MAXALIGN(IndexTupleSize(nposting)), leftoff,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add new posting list item to left page after split");
+ leftoff = OffsetNumberNext(leftoff);
+ continue; /* don't insert oposting */
+ }
+
/* add the new item if it was inserted on left page */
- if (onleft && off == xlrec->newitemoff)
+ else if (onleft && off == xlrec->newitemoff)
{
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber)
@@ -383,6 +461,98 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
}
}
+static void
+btree_xlog_dedup(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
+ Buffer buf;
+
+ if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+ {
+ char *ptr = XLogRecGetBlockData(record, 0, NULL);
+ Page page = (Page) BufferGetPage(buf);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ BTDedupState state;
+ BTDedupInterval *intervals;
+ Page newpage;
+
+ state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+ state->deduplicate = true; /* unused */
+ /* Conservatively use larger maxpostingsize than primary */
+ state->maxpostingsize = BTMaxItemSize(page);
+ state->base = NULL;
+ state->baseoff = InvalidOffsetNumber;
+ state->basetupsize = 0;
+ state->htids = palloc(state->maxpostingsize);
+ state->nhtids = 0;
+ state->nitems = 0;
+ state->phystupsize = 0;
+ state->nintervals = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ newpage = PageGetTempPageCopySpecial(page);
+
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId itemid = PageGetItemId(page, P_HIKEY);
+ Size itemsz = ItemIdGetLength(itemid);
+ IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "deduplication failed to add highkey");
+ }
+
+ intervals = (BTDedupInterval *) ptr;
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
+
+ if (offnum == minoff)
+ _bt_dedup_start_pending(state, itup, offnum);
+ else if (state->nintervals < xlrec->nintervals &&
+ state->baseoff == intervals[state->nintervals].baseoff &&
+ state->nitems < intervals[state->nintervals].nitems)
+ {
+ if (!_bt_dedup_save_htid(state, itup))
+ elog(ERROR, "deduplication failed to add heap tid to pending posting list");
+ }
+ else
+ {
+ _bt_dedup_finish_pending(newpage, state);
+ _bt_dedup_start_pending(state, itup, offnum);
+ }
+ }
+
+ _bt_dedup_finish_pending(newpage, state);
+ Assert(state->nintervals == xlrec->nintervals);
+ Assert(memcmp(state->intervals, intervals,
+ state->nintervals * sizeof(BTDedupInterval)) == 0);
+
+ if (P_HAS_GARBAGE(opaque))
+ {
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
+
+ nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+ }
+
+ PageRestoreTempPage(newpage, page);
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buf);
+ }
+
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+}
+
static void
btree_xlog_vacuum(XLogReaderState *record)
{
@@ -405,7 +575,56 @@ btree_xlog_vacuum(XLogReaderState *record)
page = (Page) BufferGetPage(buffer);
- PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+ if (xlrec->nupdated > 0)
+ {
+ OffsetNumber *updatedoffsets;
+ xl_btree_update *updates;
+
+ updatedoffsets = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ updates = (xl_btree_update *) ((char *) updatedoffsets +
+ xlrec->nupdated *
+ sizeof(OffsetNumber));
+
+ for (int i = 0; i < xlrec->nupdated; i++)
+ {
+ BTVacuumPosting vacposting;
+ IndexTuple origtuple;
+ ItemId itemid;
+ Size itemsz;
+
+ itemid = PageGetItemId(page, updatedoffsets[i]);
+ origtuple = (IndexTuple) PageGetItem(page, itemid);
+
+ vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
+ updates->ndeletedtids * sizeof(uint16));
+ vacposting->updatedoffset = updatedoffsets[i];
+ vacposting->itup = origtuple;
+ vacposting->ndeletedtids = updates->ndeletedtids;
+ memcpy(vacposting->deletetids,
+ (char *) updates + SizeOfBtreeUpdate,
+ updates->ndeletedtids * sizeof(uint16));
+
+ _bt_update_posting(vacposting);
+
+ /* Overwrite updated version of tuple */
+ itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
+ if (!PageIndexTupleOverwrite(page, updatedoffsets[i],
+ (Item) vacposting->itup, itemsz))
+ elog(PANIC, "failed to update partially dead item");
+
+ pfree(vacposting->itup);
+ pfree(vacposting);
+
+ /* advance to next xl_btree_update from array */
+ updates = (xl_btree_update *)
+ ((char *) updates + SizeOfBtreeUpdate +
+ updates->ndeletedtids * sizeof(uint16));
+ }
+ }
+
+ if (xlrec->ndeleted > 0)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
/*
* Mark the page as not containing any LP_DEAD items --- see comments
@@ -724,17 +943,19 @@ void
btree_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ MemoryContext oldCtx;
+ oldCtx = MemoryContextSwitchTo(opCtx);
switch (info)
{
case XLOG_BTREE_INSERT_LEAF:
- btree_xlog_insert(true, false, record);
+ btree_xlog_insert(true, false, false, record);
break;
case XLOG_BTREE_INSERT_UPPER:
- btree_xlog_insert(false, false, record);
+ btree_xlog_insert(false, false, false, record);
break;
case XLOG_BTREE_INSERT_META:
- btree_xlog_insert(false, true, record);
+ btree_xlog_insert(false, true, false, record);
break;
case XLOG_BTREE_SPLIT_L:
btree_xlog_split(true, record);
@@ -742,6 +963,12 @@ btree_redo(XLogReaderState *record)
case XLOG_BTREE_SPLIT_R:
btree_xlog_split(false, record);
break;
+ case XLOG_BTREE_INSERT_POST:
+ btree_xlog_insert(true, false, true, record);
+ break;
+ case XLOG_BTREE_DEDUP:
+ btree_xlog_dedup(record);
+ break;
case XLOG_BTREE_VACUUM:
btree_xlog_vacuum(record);
break;
@@ -767,6 +994,23 @@ btree_redo(XLogReaderState *record)
default:
elog(PANIC, "btree_redo: unknown op code %u", info);
}
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(opCtx);
+}
+
+void
+btree_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "Btree recovery temporary context",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+btree_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
}
/*
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 7d63a7124e..7a1616f371 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record)
case XLOG_BTREE_INSERT_LEAF:
case XLOG_BTREE_INSERT_UPPER:
case XLOG_BTREE_INSERT_META:
+ case XLOG_BTREE_INSERT_POST:
{
xl_btree_insert *xlrec = (xl_btree_insert *) rec;
@@ -38,15 +39,24 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "level %u, firstright %d, newitemoff %d",
- xlrec->level, xlrec->firstright, xlrec->newitemoff);
+ appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d",
+ xlrec->level, xlrec->firstright,
+ xlrec->newitemoff, xlrec->postingoff);
+ break;
+ }
+ case XLOG_BTREE_DEDUP:
+ {
+ xl_btree_dedup *xlrec = (xl_btree_dedup *) rec;
+
+ appendStringInfo(buf, "nintervals %u", xlrec->nintervals);
break;
}
case XLOG_BTREE_VACUUM:
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
- appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted);
+ appendStringInfo(buf, "ndeleted %u; nupdated %u",
+ xlrec->ndeleted, xlrec->nupdated);
break;
}
case XLOG_BTREE_DELETE:
@@ -130,6 +140,12 @@ btree_identify(uint8 info)
case XLOG_BTREE_SPLIT_R:
id = "SPLIT_R";
break;
+ case XLOG_BTREE_INSERT_POST:
+ id = "INSERT_POST";
+ break;
+ case XLOG_BTREE_DEDUP:
+ id = "DEDUP";
+ break;
case XLOG_BTREE_VACUUM:
id = "VACUUM";
break;
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4ea6ea7a3d..cb7b8c8a63 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1048,8 +1048,10 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
* This is better than deleting and reinserting the tuple, because it
* avoids any data shifting when the tuple size doesn't change; and
* even when it does, we avoid moving the line pointers around.
- * Conceivably this could also be of use to an index AM that cares about
- * the physical order of tuples as well as their ItemId order.
+ * This could be used by an index AM that doesn't want to unset the
+ * LP_DEAD bit when it happens to be set. It could conceivably also be
+ * used by an index AM that cares about the physical order of tuples as
+ * well as their logical/ItemId order.
*
* If there's insufficient space for the new tuple, return false. Other
* errors represent data-corruption problems, so we just elog.
@@ -1134,8 +1136,9 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
}
}
- /* Update the item's tuple length (other fields shouldn't change) */
- ItemIdSetNormal(tupid, offset + size_diff, newsize);
+ /* Update the item's tuple length without changing its lp_flags field */
+ tupid->lp_off = offset + size_diff;
+ tupid->lp_len = newsize;
/* Copy new tuple data onto page */
memcpy(PageGetItem(page, tupid), newtup, newsize);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index dc03fbde13..b6b08d0ccb 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1731,14 +1731,14 @@ psql_completion(const char *text, int start, int end)
/* ALTER INDEX SET|RESET ( */
else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "("))
COMPLETE_WITH("fillfactor",
- "vacuum_cleanup_index_scale_factor", /* BTREE */
+ "vacuum_cleanup_index_scale_factor", "deduplicate_items", /* BTREE */
"fastupdate", "gin_pending_list_limit", /* GIN */
"buffering", /* GiST */
"pages_per_range", "autosummarize" /* BRIN */
);
else if (Matches("ALTER", "INDEX", MatchAny, "SET", "("))
COMPLETE_WITH("fillfactor =",
- "vacuum_cleanup_index_scale_factor =", /* BTREE */
+ "vacuum_cleanup_index_scale_factor =", "deduplicate_items =", /* BTREE */
"fastupdate =", "gin_pending_list_limit =", /* GIN */
"buffering =", /* GiST */
"pages_per_range =", "autosummarize =" /* BRIN */
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e8d4d2b55b..bfe49f46b0 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -108,6 +108,7 @@ typedef struct BTMetaPageData
* pages */
float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
+ bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@@ -124,6 +125,14 @@ typedef struct BTMetaPageData
* need to be immediately re-indexed at pg_upgrade. In order to get the
* new heapkeyspace semantics, however, a REINDEX is needed.
*
+ * Deduplication is safe to use when the btm_allequalimage field is set to
+ * true. It's safe to read the btm_allequalimage field on version 3, but
+ * only version 4 indexes make use of deduplication. Even version 4
+ * indexes created on PostgreSQL v12 will need a REINDEX to make use of
+ * deduplication, though, since there is no other way to set
+ * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
+ * metapage field).
+ *
* Btree version 2 is mostly the same as version 3. There are two new
* fields in the metapage that were introduced in version 3. A version 2
* metapage will be automatically upgraded to version 3 on the first
@@ -156,6 +165,21 @@ typedef struct BTMetaPageData
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+/*
+ * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs that
+ * may be stored on a btree leaf page.  It is used to size the per-page
+ * temporary buffers used by index scans.
+ *
+ * Note: we don't bother considering per-tuple overheads here to keep
+ * things simple (value is based on how many elements a single array of
+ * heap TIDs must have to fill the space between the page header and
+ * special area). The value is slightly higher (i.e. more conservative)
+ * than necessary as a result, which is considered acceptable.
+ */
+#define MaxTIDsPerBTreePage \
+ (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
+ sizeof(ItemPointerData))
+
/*
* The leaf-page fillfactor defaults to 90% but is user-adjustable.
* For pages above the leaf level, we use a fixed 70% fillfactor.
@@ -230,16 +254,15 @@ typedef struct BTMetaPageData
* tuples (non-pivot tuples). _bt_check_natts() enforces the rules
* described here.
*
- * Non-pivot tuple format:
+ * Non-pivot tuple format (plain/non-posting variant):
*
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
*
- * All other types of index tuples ("pivot" tuples) only have key columns,
- * since pivot tuples only exist to represent how the key space is
+ * Non-pivot tuples complement pivot tuples, which only have key columns.
+ * The sole purpose of pivot tuples is to represent how the key space is
* separated. In general, any B-Tree index that has more than one level
* (i.e. any index that does not just consist of a metapage and a single
* leaf root page) must have some number of pivot tuples, since pivot
@@ -264,7 +287,8 @@ typedef struct BTMetaPageData
* INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
* TID column sometimes stored in pivot tuples -- that's represented by
* the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in
- * t_info is always set on BTREE_VERSION 4 pivot tuples.
+ * t_info is always set on BTREE_VERSION 4 pivot tuples, since
+ * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
*
* In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
* pivot tuples. In that case, the number of key columns is implicitly
@@ -279,90 +303,256 @@ typedef struct BTMetaPageData
* The 12 least significant offset bits from t_tid are used to represent
* the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
- * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
- * number of columns/attributes <= INDEX_MAX_KEYS.
+ * future use. BT_OFFSET_MASK should be large enough to store any number
+ * of columns/attributes <= INDEX_MAX_KEYS.
+ *
+ * Sometimes non-pivot tuples also use a representation that repurposes
+ * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a
+ * new non-pivot tuple format to support deduplication: posting list
+ * tuples. Deduplication merges together multiple equal non-pivot tuples
+ * into a logically equivalent, space efficient representation. A posting
+ * list is an array of ItemPointerData elements. Non-pivot tuples are
+ * merged together to form posting list tuples lazily, at the point where
+ * we'd otherwise have to split a leaf page.
+ *
+ * Posting tuple format (alternative non-pivot tuple representation):
+ *
+ * t_tid | t_info | key values | posting list (TID array)
+ *
+ * Posting list tuples are recognized as such by having the
+ * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
+ * bit set in t_tid. These flags redefine the content of the posting
+ * tuple's t_tid to store an offset to the posting list, as well as the
+ * total number of posting list array elements.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items present in the tuple, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use. Like any non-pivot tuple, the number of columns stored is
+ * always implicitly the total number in the index (in practice there can
+ * never be non-key columns stored, since deduplication is not supported
+ * with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store
+ * any number of posting list TIDs that might be present in a tuple (since
+ * tuple size is subject to the INDEX_SIZE_MASK limit).
*
* Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes). They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
-#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_OFFSET_MASK 0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR 0x1000
-
-/* Get/set downlink block number in pivot tuple */
-#define BTreeTupleGetDownLink(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetDownLink(itup, blkno) \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
+#define BT_IS_POSTING 0x2000
/*
- * Get/set leaf page highkey's link. During the second phase of deletion, the
- * target leaf page's high key may point to an ancestor page (at all other
- * times, the leaf level high key's link is not used). See the nbtree README
- * for full details.
+ * Note: BTreeTupleIsPivot() can have false negatives (but not false
+ * positives) when used with !heapkeyspace indexes
*/
-#define BTreeTupleGetTopParent(itup) \
- ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
-#define BTreeTupleSetTopParent(itup, blkno) \
- do { \
- ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
- BTreeTupleSetNAtts((itup), 0); \
- } while(0)
+static inline bool
+BTreeTupleIsPivot(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
+ return false;
+
+ return true;
+}
+
+static inline bool
+BTreeTupleIsPosting(IndexTuple itup)
+{
+ if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ return false;
+ /* presence of BT_IS_POSTING in offset number indicates posting tuple */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
+ return false;
+
+ return true;
+}
+
+static inline void
+BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset)
+{
+ Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids);
+ Assert(postingoffset == MAXALIGN(postingoffset));
+ Assert(postingoffset < INDEX_SIZE_MASK);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
+ ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
+}
+
+static inline uint16
+BTreeTupleGetNPosting(IndexTuple posting)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPosting(posting));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
+ return (existing & BT_OFFSET_MASK);
+}
+
+static inline uint32
+BTreeTupleGetPostingOffset(IndexTuple posting)
+{
+ Assert(BTreeTupleIsPosting(posting));
+
+ return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
+}
+
+static inline ItemPointer
+BTreeTupleGetPosting(IndexTuple posting)
+{
+ return (ItemPointer) ((char *) posting +
+ BTreeTupleGetPostingOffset(posting));
+}
+
+static inline ItemPointer
+BTreeTupleGetPostingN(IndexTuple posting, int n)
+{
+ return BTreeTupleGetPosting(posting) + n;
+}
/*
- * Get/set number of attributes within B-tree index tuple.
+ * Get/set downlink block number in pivot tuple.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetDownLink(IndexTuple pivot)
+{
+ return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
+}
+
+static inline void
+BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
+}
+
+/*
+ * Get number of attributes within tuple.
*
* Note that this does not include an implicit tiebreaker heap TID
* attribute, if any. Note also that the number of key attributes must be
* explicitly represented in all heapkeyspace pivot tuples.
+ *
+ * Note: This is defined as a macro rather than an inline function to
+ * avoid including rel.h.
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
- (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ (BTreeTupleIsPivot(itup)) ? \
( \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
-#define BTreeTupleSetNAtts(itup, n) \
- do { \
- (itup)->t_info |= INDEX_ALT_TID_MASK; \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
- } while(0)
/*
- * Get tiebreaker heap TID attribute, if any. Macro works with both pivot
- * and non-pivot tuples, despite differences in how heap TID is represented.
+ * Set number of attributes in tuple, making it into a pivot tuple
*/
-#define BTreeTupleGetHeapTID(itup) \
- ( \
- (itup)->t_info & INDEX_ALT_TID_MASK && \
- (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \
- ( \
- (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
- sizeof(ItemPointerData)) \
- ) \
- : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
- )
+static inline void
+BTreeTupleSetNAtts(IndexTuple itup, int natts)
+{
+ Assert(natts <= INDEX_MAX_KEYS);
+
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */
+ ItemPointerSetOffsetNumber(&itup->t_tid, natts);
+ Assert(BTreeTupleIsPivot(itup));
+}
+
/*
- * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
- * representation (currently limited to pivot tuples)
+ * Set the bit indicating heap TID attribute present in pivot tuple
*/
-#define BTreeTupleSetAltHeapTID(itup) \
- do { \
- Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
- ItemPointerSetOffsetNumber(&(itup)->t_tid, \
- ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \
- } while(0)
+static inline void
+BTreeTupleSetAltHeapTID(IndexTuple pivot)
+{
+ OffsetNumber existing;
+
+ Assert(BTreeTupleIsPivot(pivot));
+
+ existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid);
+ ItemPointerSetOffsetNumber(&pivot->t_tid,
+ existing | BT_PIVOT_HEAP_TID_ATTR);
+}
+
+/*
+ * Get/set leaf page's "top parent" link from its high key. Used during page
+ * deletion.
+ *
+ * Note: Cannot assert that tuple is a pivot tuple. If we did so then
+ * !heapkeyspace indexes would exhibit false positive assertion failures.
+ */
+static inline BlockNumber
+BTreeTupleGetTopParent(IndexTuple leafhikey)
+{
+ return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
+}
+
+static inline void
+BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
+{
+ ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
+ BTreeTupleSetNAtts(leafhikey, 0);
+}
+
+/*
+ * Get tiebreaker heap TID attribute, if any.
+ *
+ * This returns the first/lowest heap TID in the case of a posting list tuple.
+ */
+static inline ItemPointer
+BTreeTupleGetHeapTID(IndexTuple itup)
+{
+ if (BTreeTupleIsPivot(itup))
+ {
+ /* Pivot tuple heap TID representation? */
+ if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
+ sizeof(ItemPointerData));
+
+ /* Heap TID attribute was truncated */
+ return NULL;
+ }
+ else if (BTreeTupleIsPosting(itup))
+ return BTreeTupleGetPosting(itup);
+
+ return &itup->t_tid;
+}
+
+/*
+ * Get maximum heap TID attribute, which could be the only TID in the case of
+ * a non-pivot tuple that does not have a posting list.
+ *
+ * Works with non-pivot tuples only.
+ */
+static inline ItemPointer
+BTreeTupleGetMaxHeapTID(IndexTuple itup)
+{
+ Assert(!BTreeTupleIsPivot(itup));
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ uint16 nposting = BTreeTupleGetNPosting(itup);
+
+ return BTreeTupleGetPostingN(itup, nposting - 1);
+ }
+
+ return &itup->t_tid;
+}
/*
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
@@ -439,6 +629,9 @@ typedef BTStackData *BTStack;
* indexes whose version is >= version 4. It's convenient to keep this close
* by, rather than accessing the metapage repeatedly.
*
+ * allequalimage is set to indicate that deduplication is safe for the index.
+ * This is also a property of the index relation rather than an indexscan.
+ *
* anynullkeys indicates if any of the keys had NULL value when scankey was
* built from index tuple (note that already-truncated tuple key attributes
* set NULL as a placeholder key value, which also affects value of
@@ -474,6 +667,7 @@ typedef BTStackData *BTStack;
typedef struct BTScanInsertData
{
bool heapkeyspace;
+ bool allequalimage;
bool anynullkeys;
bool nextkey;
bool pivotsearch;
@@ -512,10 +706,94 @@ typedef struct BTInsertStateData
bool bounds_valid;
OffsetNumber low;
OffsetNumber stricthigh;
+
+ /*
+ * if _bt_binsrch_insert found the location inside existing posting list,
+ * save the position inside the list. -1 sentinel value indicates overlap
+ * with an existing posting list tuple that has its LP_DEAD bit set.
+ */
+ int postingoff;
} BTInsertStateData;
typedef BTInsertStateData *BTInsertState;
+/*
+ * State used to represent an individual pending tuple during
+ * deduplication.
+ */
+typedef struct BTDedupInterval
+{
+ OffsetNumber baseoff;
+ uint16 nitems;
+} BTDedupInterval;
+
+/*
+ * BTDedupStateData is a working area used during deduplication.
+ *
+ * The status info fields track the state of a whole-page deduplication pass.
+ * State about the current pending posting list is also tracked.
+ *
+ * A pending posting list is comprised of a contiguous group of equal items
+ * from the page, starting from page offset number 'baseoff'. This is the
+ * offset number of the "base" tuple for the new posting list.  'nitems' is the
+ * current total number of existing items from the page that will be merged to
+ * make a new posting list tuple, including the base tuple item. (Existing
+ * items may themselves be posting list tuples, or regular non-pivot tuples.)
+ *
+ * The total size of the existing tuples to be freed when pending posting list
+ * is processed gets tracked by 'phystupsize'. This information allows
+ * deduplication to calculate the space saving for each new posting list
+ * tuple, and for the entire pass over the page as a whole.
+ */
+typedef struct BTDedupStateData
+{
+ /* Deduplication status info for entire pass over page */
+ bool deduplicate; /* Still deduplicating page? */
+ Size maxpostingsize; /* Limit on size of final tuple */
+
+ /* Metadata about base tuple of current pending posting list */
+ IndexTuple base; /* Use to form new posting list */
+ OffsetNumber baseoff; /* page offset of base */
+ Size basetupsize; /* base size without original posting list */
+
+ /* Other metadata about pending posting list */
+ ItemPointer htids; /* Heap TIDs in pending posting list */
+ int nhtids; /* Number of heap TIDs in htids array */
+ int nitems; /* Number of existing tuples/line pointers */
+ Size phystupsize; /* Includes line pointer overhead */
+
+ /*
+ * Array of tuples to go on new version of the page. Contains one entry
+ * for each group of consecutive items. Note that existing tuples that
+ * will not become posting list tuples do not appear in the array (they
+ * are implicitly unchanged by deduplication pass).
+ */
+ int nintervals; /* current size of intervals array */
+ BTDedupInterval intervals[MaxIndexTuplesPerPage];
+} BTDedupStateData;
+
+typedef BTDedupStateData *BTDedupState;
+
+/*
+ * BTVacuumPostingData is state that represents how to VACUUM a posting list
+ * tuple when some (though not all) of its TIDs are to be deleted.
+ *
+ * Convention is that itup field is the original posting list tuple on input,
+ * and palloc()'d final tuple used to overwrite existing tuple on output.
+ */
+typedef struct BTVacuumPostingData
+{
+ /* Tuple that will be/was updated */
+ IndexTuple itup;
+ OffsetNumber updatedoffset;
+
+ /* State needed to describe final itup in WAL */
+ uint16 ndeletedtids;
+ uint16 deletetids[FLEXIBLE_ARRAY_MEMBER];
+} BTVacuumPostingData;
+
+typedef BTVacuumPostingData *BTVacuumPosting;
+
/*
* BTScanOpaqueData is the btree-private state needed for an indexscan.
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
@@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState;
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
- * offset within that array.
+ * offset within that array. Posting list tuples store a "base" tuple once,
+ * allowing the same key to be returned for each TID in the posting list
+ * tuple.
*/
typedef struct BTScanPosItem /* what we remember about each match */
@@ -583,7 +863,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
- BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@@ -691,6 +971,7 @@ typedef struct BTOptions
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
+ bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
#define BTGetFillFactor(relation) \
@@ -701,6 +982,11 @@ typedef struct BTOptions
BTREE_DEFAULT_FILLFACTOR)
#define BTGetTargetPageFreeSpace(relation) \
(BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
+#define BTGetDeduplicateItems(relation) \
+ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
+ relation->rd_rel->relam == BTREE_AM_OID), \
+ ((relation)->rd_options ? \
+ ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
/*
* Constant definition for progress reporting. Phase numbers must match
@@ -747,6 +1033,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
extern void _bt_parallel_done(IndexScanDesc scan);
extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
+/*
+ * prototypes for functions in nbtdedup.c
+ */
+extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
+ IndexTuple newitem, Size newitemsz,
+ bool checkingunique);
+extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+ OffsetNumber baseoff);
+extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
+extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
+extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
+ int nhtids);
+extern void _bt_update_posting(BTVacuumPosting vacposting);
+extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
+ int postingoff);
+
/*
* prototypes for functions in nbtinsert.c
*/
@@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
/*
* prototypes for functions in nbtpage.c
*/
-extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
+extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
+ bool allequalimage);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
-extern bool _bt_heapkeyspace(Relation rel);
+extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
+ bool *allequalimage);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable);
+ OffsetNumber *deletable, int ndeletable,
+ BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
Relation heapRel);
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 776a9bd723..347976c532 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,7 +28,8 @@
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
-/* 0x50 and 0x60 are unused */
+#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
+#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
@@ -53,21 +54,34 @@ typedef struct xl_btree_metadata
uint32 fastlevel;
TransactionId oldest_btpo_xact;
float8 last_cleanup_num_heap_tuples;
+ bool allequalimage;
} xl_btree_metadata;
/*
* This is what we need to know about simple (without split) insert.
*
- * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
- * Note that INSERT_META implies it's not a leaf page.
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST. Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
+ * page.
*
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
* Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case). A split offset for
+ * the posting list is logged before the original new item. Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step. Also, recovery generates a "final"
+ * newitem. See _bt_swap_posting() for details on posting list splits.
*/
typedef struct xl_btree_insert
{
OffsetNumber offnum;
+
+ /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
+ /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
} xl_btree_insert;
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@@ -92,8 +106,37 @@ typedef struct xl_btree_insert
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
- * An IndexTuple representing the high key of the left page must follow with
- * either variant.
+ * _R variant split records generally do not have a newitem (_R variant leaf
+ * page split records that must deal with a posting list split will include an
+ * explicit newitem, though it is never used on the right page -- it is
+ * actually an orignewitem needed to update existing posting list). The new
+ * high key of the left/original page appears last of all (and must always be
+ * present).
+ *
+ * Page split records that need the REDO routine to deal with a posting list
+ * split directly will have an explicit newitem, which is actually an
+ * orignewitem (the newitem as it was before the posting list split, not
+ * after). A posting list split always has a newitem that comes immediately
+ * after the posting list being split (which would have overlapped with
+ * orignewitem prior to split). Usually REDO must deal with posting list
+ * splits with an _L variant page split record, and usually both the new
+ * posting list and the final newitem go on the left page (the existing
+ * posting list will be inserted instead of the old, and the final newitem
+ * will be inserted next to that). However, _R variant split records will
+ * include an orignewitem when the split point for the page happens to have a
+ * lastleft tuple that is also the posting list being split (leaving newitem
+ * as the page split's firstright tuple). The existence of this corner case
+ * does not change the basic fact about newitem/orignewitem for the REDO
+ * routine: it is always state used for the left page alone. (This is why the
+ * record's postingoff field isn't a reliable indicator of whether or not a
+ * posting list split occurred during the page split; a non-zero value merely
+ * indicates that the REDO routine must reconstruct a new posting list tuple
+ * that is needed for the left page.)
+ *
+ * This posting list split handling is equivalent to the xl_btree_insert REDO
+ * routine's INSERT_POST handling. While the details are more complicated
+ * here, the concept and goals are exactly the same. See _bt_swap_posting()
+ * for details on posting list splits.
*
* Backup Blk 1: new right page
*
@@ -111,15 +154,33 @@ typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */
- OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
+ OffsetNumber newitemoff; /* new item's offset */
+ uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split;
-#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
+#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When a page is deduplicated, consecutive groups of tuples with equal keys are
+ * merged together into posting list tuples.
+ *
+ * The WAL record represents a deduplication pass for a leaf page. An array
+ * of BTDedupInterval structs follows.
+ */
+typedef struct xl_btree_dedup
+{
+ uint16 nintervals;
+
+ /* DEDUPLICATION INTERVALS FOLLOW */
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
- * single index page when *not* executed by VACUUM.
+ * single index page when *not* executed by VACUUM. Deletion of a subset of
+ * the TIDs within a posting list tuple is not supported.
*
* Backup Blk 0: index page
*/
@@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
- * This is what we need to know about vacuum of individual leaf index tuples.
- * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
+ * This is what we need to know about which TIDs to remove from an individual
+ * posting list tuple during vacuuming. An array of these may appear at the
+ * end of xl_btree_vacuum records.
+ */
+typedef struct xl_btree_update
+{
+ uint16 ndeletedtids;
+
+ /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
+
+/*
+ * This is what we need to know about a VACUUM of a leaf page. The WAL record
+ * can represent deletion of any number of index tuples on a single index page
+ * when executed by VACUUM. It can also support "updates" of index tuples,
+ * which is how deletes of a subset of TIDs contained in an existing posting
+ * list tuple are implemented. (Updates are only used when there will be some
+ * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
+ * just be deleted).
*
- * Note that the WAL record in any vacuum of an index must have at least one
- * item to delete.
+ * Updated posting list tuples are represented using xl_btree_update metadata.
+ * The REDO routine uses each xl_btree_update (plus its corresponding original
+ * index tuple from the target leaf page) to generate the final updated tuple.
*/
typedef struct xl_btree_vacuum
{
- uint32 ndeleted;
+ uint16 ndeleted;
+ uint16 nupdated;
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA ARRAY FOLLOWS */
} xl_btree_vacuum;
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
/*
* This is what we need to know about marking an empty branch for deletion.
@@ -245,6 +328,8 @@ typedef struct xl_btree_newroot
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTXLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c88dccfb8d..6c15df7e70 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 087918d41d..27ded593ab 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index f567117a46..1646deb092 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -200,7 +200,7 @@ reset enable_indexscan;
reset enable_bitmapscan;
-- Also check LIKE optimization with binary-compatible cases
create temp table btree_bpchar (f1 text collate "C");
-create index on btree_bpchar(f1 bpchar_ops);
+create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
-- doesn't match index:
explain (costs off)
@@ -266,6 +266,24 @@ select * from btree_bpchar where f1::bpchar like 'foo%';
fool
(2 rows)
+-- get test coverage for "single value" deduplication strategy:
+insert into btree_bpchar select 'foo' from generate_series(1,1500);
+--
+-- Perform unique checking, with and without the use of deduplication
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off);
+-- Generate enough garbage tuples in index to ensure that even the unique index
+-- with deduplication enabled has to check multiple leaf pages during unique
+-- checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+ FOR r IN 1..1350 LOOP
+ DELETE FROM dedup_unique_test_table;
+ INSERT INTO dedup_unique_test_table SELECT 1;
+ END LOOP;
+END$$;
--
-- Test B-tree fast path (cache rightmost leaf page) optimization.
--
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 558dcae0ec..6e14b935ce 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -86,7 +86,7 @@ reset enable_bitmapscan;
-- Also check LIKE optimization with binary-compatible cases
create temp table btree_bpchar (f1 text collate "C");
-create index on btree_bpchar(f1 bpchar_ops);
+create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on);
insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux');
-- doesn't match index:
explain (costs off)
@@ -103,6 +103,26 @@ explain (costs off)
select * from btree_bpchar where f1::bpchar like 'foo%';
select * from btree_bpchar where f1::bpchar like 'foo%';
+-- get test coverage for "single value" deduplication strategy:
+insert into btree_bpchar select 'foo' from generate_series(1,1500);
+
+--
+-- Perform unique checking, with and without the use of deduplication
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off);
+-- Generate enough garbage tuples in index to ensure that even the unique index
+-- with deduplication enabled has to check multiple leaf pages during unique
+-- checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+ FOR r IN 1..1350 LOOP
+ DELETE FROM dedup_unique_test_table;
+ INSERT INTO dedup_unique_test_table SELECT 1;
+ END LOOP;
+END$$;
+
--
-- Test B-tree fast path (cache rightmost leaf page) optimization.
--