diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 6a058ccdac..31717321b0 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -167,6 +168,7 @@ static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset); static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot); +static inline ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup); /* * bt_index_check(index regclass, heapallindexed boolean) @@ -278,7 +280,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, if (btree_index_mainfork_expected(indrel)) { - bool heapkeyspace; + bool heapkeyspace, + allequalimage; RelationOpenSmgr(indrel); if (!smgrexists(indrel->rd_smgr, MAIN_FORKNUM)) @@ -288,7 +291,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, RelationGetRelationName(indrel)))); /* Check index, possibly against table it is an index on */ - heapkeyspace = _bt_heapkeyspace(indrel); + _bt_metaversion(indrel, &heapkeyspace, &allequalimage); bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, heapallindexed, rootdescend); } @@ -419,12 +422,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxTIDsPerBTreePage / 3 "plain" tuples -- see + * bt_posting_plain_tuple() for definition, and details of how posting + * list tuples are handled. */ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +927,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -954,13 +958,15 @@ bt_target_page_check(BtreeCheckState *state) if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target, offset)) { + ItemPointer tid; char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -994,18 +1000,20 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. 
+ * Note that we deliberately don't do individual searches for each + * TID, since the posting list itself is validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); - htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1017,6 +1025,40 @@ bt_target_page_check(BtreeCheckState *state) (uint32) state->targetlsn))); } + /* + * If tuple is a posting list tuple, make sure posting list TIDs are + * in order + */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid = psprintf("(%u,%u)", state->targetblock, offset); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("posting list contains misplaced TID in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%X.", + itid, i, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1049,13 +1091,14 @@ bt_target_page_check(BtreeCheckState *state) if (tupsize > (lowersizelimit ? BTMaxItemSize(state->target) : BTMaxItemSizeNoHeapTid(state->target))) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1074,12 +1117,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "plain" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_plain_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1150,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. 
* * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,17 +1191,22 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && BTreeTupleIsPosting(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); + if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) { + ItemPointer tid = BTreeTupleGetPointsToTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1150,6 +1219,8 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* Reset, in case scantid was set to (itup) posting tuple's max TID */ + skey->scantid = scantid; /* * * Item order check * @@ -1160,15 +1231,17 @@ bt_target_page_check(BtreeCheckState *state) if (OffsetNumberNext(offset) <= max && !invariant_l_offset(state, skey, OffsetNumberNext(offset))) { + ItemPointer tid; char *itid, *htid, *nitid, *nhtid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetPointsToTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1250,10 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + tid = BTreeTupleGetPointsToTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1953,10 +2027,9 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. 
*/ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2042,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2107,29 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "plain" tuple for nth posting list entry/TID. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple index tuples are merged together into one equivalent + * posting list index tuple when convenient. + * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "plain" + * tuples. Each tuple must be fingerprinted separately -- there must be one + * tuple for each corresponding Bloom filter probe during the heap scan. + * + * Note: Caller still needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_plain_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + /* * Search for itup in index, starting from fast root page. itup must be a * non-pivot tuple. This is only supported with heapkeyspace indexes, since @@ -2087,6 +2186,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = lbuf; @@ -2094,7 +2194,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) offnum = _bt_binsrch_insert(state->rel, &insertstate); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && _bt_compare(state->rel, key, page, offnum) == 0) exists = true; _bt_relbuf(state->rel, lbuf); @@ -2548,26 +2650,69 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, } /* - * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must - * be present in cases where that is mandatory. - * - * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK - * bit is effectively a proxy for whether or not the tuple is a pivot tuple. - * It may become more useful in the future, when non-pivot tuples support their - * own alternative INDEX_ALT_TID_MASK representation. + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples) */ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); - BlockNumber targetblock = state->targetblock; + ItemPointer htid; - if (result == NULL && nonpivot) + /* + * Caller determines whether this is supposed to be a pivot or non-pivot + * tuple using page type and item offset number. Verify that tuple + * metadata agrees with this. 
+ */ + Assert(state->heapkeyspace); + if (BTreeTupleIsPivot(itup) && nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + if (!BTreeTupleIsPivot(itup) && !nonpivot) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple", + state->targetblock, + RelationGetRelationName(state->rel)))); + + htid = BTreeTupleGetHeapTID(itup); + if (!ItemPointerIsValid(htid) && nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", - targetblock, RelationGetRelationName(state->rel)))); + state->targetblock, + RelationGetRelationName(state->rel)))); - return result; + return htid; +} + +/* + * Return the "pointed to" TID for itup, which is used to generate a + * descriptive error message. itup must be a "data item" tuple (it wouldn't + * make much sense to call here with a high key tuple, since there won't be a + * valid downlink/block number to display). + * + * Returns either a heap TID (which will be the first heap TID in posting list + * if itup is posting list tuple), or a TID that contains downlink block + * number, plus some encoded metadata (e.g., the number of attributes present + * in itup). + */ +static inline ItemPointer +BTreeTupleGetPointsToTID(IndexTuple itup) +{ + /* + * Rely on the assumption that !heapkeyspace internal page data items will + * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't + * recognize it as a pivot tuple, but everything still works out because + * the t_tid field is still returned + */ + if (!BTreeTupleIsPivot(itup)) + return BTreeTupleGetHeapTID(itup); + + /* Pivot tuple returns TID with downlink block (heapkeyspace variant) */ + return &itup->t_tid; } diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index fcf771c857..f02e02b0ac 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -557,11 +557,208 @@ equalimage(opcintype oid) returns bool Implementation + + This section covers B-Tree index implementation details that may be + of use to advanced users. See + src/backend/access/nbtree/README in the source + distribution for a much more detailed, internals-focused description + of the B-Tree implementation. + + + B-Tree Structure - An introduction to the btree index implementation can be found in - src/backend/access/nbtree/README. + PostgreSQL B-Tree indexes are + multi-level tree structures, where each level of the tree can be + used as a doubly-linked list of pages. A single metapage is stored + in a fixed position at the start of the first segment file of the + index. All other pages are either leaf pages or internal pages. + Leaf pages are the pages on the lowest level of the tree. All + other levels consist of internal pages. Each leaf page contains + tuples that point to table rows. Each internal page contains + tuples that point to the next level down in the tree. Typically, + over 99% of all pages are leaf pages. Both internal pages and leaf + pages use the standard page format described in . + + + New leaf pages are added to a B-Tree index when an existing leaf + page cannot fit an incoming tuple. 
A page + split operation makes room for items that originally + belonged on the overflowing page by moving a portion of the items + to a new page. Page splits must also insert a new + downlink to the new page in the parent page, + which may cause the parent to split in turn. Page splits + cascade upwards in a recursive fashion. When the + root page finally cannot fit a new downlink, a root page + split operation takes place. This adds a new level to + the tree structure by creating a new root page that is one level + above the original root page. + + + + + Deduplication + + A duplicate is a leaf page tuple (a tuple that points to a table + row) where all indexed key columns have values + that match corresponding column values from at least one other leaf + page tuple that's close by in the same index. Duplicate tuples are + quite common in practice. B-Tree indexes can use a special, + space-efficient representation for duplicates when an optional + technique is enabled: deduplication. + + + Deduplication works by periodically merging groups of duplicate + tuples together, forming a single posting list tuple for each + group. The column key value(s) only appear once in this + representation. This is followed by a sorted array of + TIDs that point to rows in the table. This + significantly reduces the storage size of indexes where each value + (or each distinct combination of column values) appears several + times on average. The latency of queries can be reduced + significantly. Overall query throughput may increase + significantly. The overhead of routine index vacuuming may also be + reduced significantly. + + + + While NULL is generally not considered to be equal to any other + value, including NULL, NULL is nevertheless treated as just + another value from the domain of indexed values by the B-Tree + implementation (except when enforcing uniqueness in a unique + index). B-Tree deduplication is therefore just as effective with + duplicates that contain a NULL value. + + + + The deduplication process occurs lazily, when a new item is + inserted that cannot fit on an existing leaf page. This prevents + (or at least delays) leaf page splits. Unlike GIN posting list + tuples, B-Tree posting list tuples do not need to expand every time + a new duplicate is inserted; they are merely an alternative + physical representation of the original logical contents of the + leaf page. This design prioritizes consistent performance with + mixed read-write workloads. Most client applications will at least + see a moderate performance benefit from using deduplication. + Deduplication is enabled by default. + + + Write-heavy workloads that don't benefit from deduplication due to + having few or no duplicate values in indexes will incur a small, + fixed performance penalty (unless deduplication is explicitly + disabled). The deduplicate_items storage + parameter can be used to disable deduplication within individual + indexes. There is never any performance penalty with read-only + workloads, since reading posting list tuples is at least as + efficient as reading the standard tuple representation. Disabling + deduplication isn't usually helpful. + + + B-Tree indexes are not directly aware that under MVCC, there might + be multiple extant versions of the same logical table row; to an + index, each tuple is an independent object that needs its own index + entry. Thus, an update of a row always creates all-new index + entries for the row, even if the key values did not change. 
Some + workloads suffer from index bloat caused by these + implementation-level version duplicates (this is typically a + problem for UPDATE-heavy workloads that cannot + apply the HOT optimization due to modifying at + least one indexed column). B-Tree deduplication does not + distinguish between these implementation-level version duplicates + and conventional duplicates. Deduplication can nevertheless help + with controlling index bloat caused by implementation-level version + churn. + + + + A special heuristic is applied to determine whether a + deduplication pass in a unique index should take place. It can + often skip straight to splitting a leaf page, avoiding a + performance penalty from wasting cycles on unhelpful deduplication + passes. If you're concerned about the overhead of deduplication, + consider setting deduplicate_items = off + selectively. Leaving deduplication enabled in unique indexes has + little downside. + + + + Deduplication cannot be used in all cases due to + implementation-level restrictions. Deduplication safety is + determined when CREATE INDEX or + REINDEX run. + + + Note that deduplication is deemed unsafe and cannot be used in the + following cases involving semantically significant differences + among equal datums: + + + + + + text, varchar, and char + cannot use deduplication when a + nondeterministic collation is used. Case + and accent differences must be preserved among equal datums. + + + + + + numeric cannot use deduplication. Numeric display + scale must be preserved among equal datums. + + + + + + jsonb cannot use deduplication, since the + jsonb B-Tree operator class uses + numeric internally. + + + + + + float4 and float8 cannot use + deduplication. These types have distinct representations for + -0 and 0, which are + nevertheless considered equal. This difference must be + preserved. + + + + + + There is one further implementation-level restriction that may be + lifted in a future version of + PostgreSQL: + + + + + + Container types (such as composite types, arrays, or range + types) cannot use deduplication. + + + + + + There is one further implementation-level restriction that applies + regardless of the operator class or collation used: + + + + + + INCLUDE indexes can never use deduplication. + + + + diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 057a6bb81a..20cdfabd7b 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr nondeterministic collations give a more correct behavior, especially when considering the full power of Unicode and its many special cases, they also have some drawbacks. Foremost, their use leads - to a performance penalty. Also, certain operations are not possible with - nondeterministic collations, such as pattern matching operations. - Therefore, they should be used only in cases where they are specifically - wanted. + to a performance penalty. Note, in particular, that B-tree cannot use + deduplication with indexes that use a nondeterministic collation. Also, + certain operations are not possible with nondeterministic collations, + such as pattern matching operations. Therefore, they should be used + only in cases where they are specifically wanted. 
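The deduplication-safety cases listed above all reduce to one question: can the operator class promise that equal datums are interchangeable as binary images? That promise is made through the equalimage support function whose signature (equalimage(opcintype oid) returns bool) appears in this chapter's hunk header. Purely as an illustration, and not as part of this patch, a hypothetical opclass whose equal datums always have identical representations could register a support function along these lines (example_equalimage is an invented name):

#include "postgres.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(example_equalimage);	/* needed if shipped in an extension */

/*
 * Hypothetical "equalimage" support function: report that deduplication is
 * always safe because equal datums always have identical representations.
 * The opcintype argument can be ignored when the answer never varies.
 */
Datum
example_equalimage(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(true);
}

Opclasses for the types listed above (numeric, text under a nondeterministic collation, and so on) cannot make this promise, which is why deduplication is disabled for them.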
diff --git a/doc/src/sgml/citext.sgml b/doc/src/sgml/citext.sgml index 667824fb0b..5986601327 100644 --- a/doc/src/sgml/citext.sgml +++ b/doc/src/sgml/citext.sgml @@ -233,9 +233,10 @@ SELECT * FROM users WHERE nick = 'Larry'; citext is not as efficient as text because the operator functions and the B-tree comparison functions must make copies - of the data and convert it to lower case for comparisons. It is, - however, slightly more efficient than using lower to get - case-insensitive matching. + of the data and convert it to lower case for comparisons. Also, only + text can support B-Tree deduplication. However, + citext is slightly more efficient than using + lower to get case-insensitive matching. diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index ceda48e0fc..28035f1635 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -16561,10 +16561,11 @@ AND rows. Two rows might have a different binary representation even though comparisons of the two rows with the equality operator is true. The ordering of rows under these comparison operators is deterministic - but not otherwise meaningful. These operators are used internally for - materialized views and might be useful for other specialized purposes - such as replication but are not intended to be generally useful for - writing queries. + but not otherwise meaningful. These operators are used internally + for materialized views and might be useful for other specialized + purposes such as replication and B-Tree deduplication (see ). They are not intended to be + generally useful for writing queries, though. diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index ab362a0dc5..a05e2e6b9c 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -171,6 +171,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] @@ -393,10 +395,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] - B-tree indexes additionally accept this parameter: + B-tree indexes also accept these parameters: + + deduplicate_items + + deduplicate_items + storage parameter + + + + + Controls usage of the B-tree deduplication technique described + in . Set to + ON or OFF to enable or + disable the optimization. (Alternative spellings of + ON and OFF are allowed as + described in .) The default is + ON. + + + + + Turning deduplicate_items off via + ALTER INDEX prevents future insertions from + triggering deduplication, but does not in itself make existing + posting list tuples use the standard tuple representation. + + + + + vacuum_cleanup_index_scale_factor @@ -451,9 +482,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] . It is a Boolean parameter: ON enables fast update, OFF disables it. - (Alternative spellings of ON and OFF are - allowed as described in .) The - default is ON. + The default is ON. 
@@ -805,6 +834,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) INCLUDE (director, rating); + + To create a B-Tree index with deduplication disabled: + +CREATE INDEX title_idx ON films (title) WITH (deduplicate_items = off); + + + To create an index on the expression lower(title), allowing efficient case-insensitive searches: diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 79430d2b7b..5325dd3f61 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -158,6 +158,16 @@ static relopt_bool boolRelOpts[] = }, true }, + { + { + "deduplicate_items", + "Enables \"deduplicate items\" feature for this btree index", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock /* since it applies only to later + * inserts */ + }, + true + }, /* list terminator */ {{NULL}} }; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index c16eb05416..dfba5ae39a 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bf245f5dab..d69808e78c 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ nbtcompare.o \ + nbtdedup.o \ nbtinsert.o \ nbtpage.o \ nbtree.o \ diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index c60a4d0d9e..6499f5adb7 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every table TID within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -726,6 +729,134 @@ if it must. When a page that's already full of duplicates must be split, the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +The overall effect is that leaf page splits gracefully adapt to inserts of +large groups of duplicates, maximizing space utilization. Note also that +"trapping" large groups of duplicates on the same leaf page like this makes +deduplication more efficient. Deduplication can be performed infrequently, +without merging together existing posting list tuples too often. 
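Before the following sections discuss posting list tuples in detail, a minimal illustration may help. This sketch is not part of the patch (print_posting_tids is a hypothetical helper); it only shows that the heap TIDs in a posting list tuple form a plain sorted array, read through the same accessors the patch uses in amcheck and nbtdedup.c:

#include "postgres.h"
#include "access/nbtree.h"

/*
 * Walk the heap TIDs stored in a posting list tuple.  The TIDs are kept as
 * a simple array in ascending order, so reading them is just a matter of
 * indexing into it via BTreeTupleGetPostingN().
 */
static void
print_posting_tids(IndexTuple itup)
{
	Assert(BTreeTupleIsPosting(itup));

	for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
	{
		ItemPointer htid = BTreeTupleGetPostingN(itup, i);

		elog(DEBUG1, "posting entry %d: (%u,%u)", i,
			 ItemPointerGetBlockNumberNoCheck(htid),
			 ItemPointerGetOffsetNumberNoCheck(htid));
	}
}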
+ +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid (or at least delay) page splits. Note that the +goals for deduplication in unique indexes are rather different; see later +section for details. Deduplication alters the physical representation of +tuples without changing the logical contents of the index, and without +adding overhead to read queries. Non-pivot tuples are merged together +into a single physical tuple with a posting list (a simple array of heap +TIDs with the standard item pointer format). Deduplication is always +applied lazily, at the point where it would otherwise be necessary to +perform a page split. It occurs only when LP_DEAD items have been +removed, as our last line of defense against splitting a leaf page. We +can set the LP_DEAD bit with posting list tuples, though only when all +TIDs are known dead. + +Our lazy approach to deduplication allows the page space accounting used +during page splits to have absolutely minimal special case logic for +posting lists. Posting lists can be thought of as extra payload that +suffix truncation will reliably truncate away as needed during page +splits, just like non-key columns from an INCLUDE index tuple. +Incoming/new tuples can generally be treated as non-overlapping plain +items (though see section on posting list splits for information about how +overlapping new/incoming items are really handled). + +The representation of posting lists is almost identical to the posting +lists used by GIN, so it would be straightforward to apply GIN's varbyte +encoding compression scheme to individual posting lists. Posting list +compression would break the assumptions made by posting list splits about +page space accounting (see later section), so it's not clear how +compression could be integrated with nbtree. Besides, posting list +compression does not offer a compelling trade-off for nbtree, since in +general nbtree is optimized for consistent performance with many +concurrent readers and writers. + +A major goal of our lazy approach to deduplication is to limit the +performance impact of deduplication with random updates. Even concurrent +append-only inserts of the same key value will tend to have inserts of +individual index tuples in an order that doesn't quite match heap TID +order. Delaying deduplication minimizes page level fragmentation. + +Deduplication in unique indexes +------------------------------- + +Very often, the range of values that can be placed on a given leaf page in +a unique index is fixed and permanent. For example, a primary key on an +identity column will usually only have page splits caused by the insertion +of new logical rows within the rightmost leaf page. If there is a split +of a non-rightmost leaf page, then the split must have been triggered by +inserts associated with an UPDATE of an existing logical row. Splitting a +leaf page purely to store multiple versions should be considered +pathological, since it permanently degrades the index structure in order +to absorb a temporary burst of duplicates. Deduplication in unique +indexes helps to prevent these pathological page splits. Storing +duplicates in a space efficient manner is not the goal, since in the long +run there won't be any duplicates anyway. Rather, we're buying time for +standard garbage collection mechanisms to run before a page split is +needed. 
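To make the space accounting sketched in the notes above concrete, here is a rough, illustrative helper. It is not part of the patch (projected_space_saving is a hypothetical name), and it assumes that every merged tuple is a plain non-pivot tuple of the same MAXALIGN()'d size; it simply mirrors the arithmetic performed by _bt_dedup_save_htid() and _bt_dedup_finish_pending() in nbtdedup.c further down:

#include "postgres.h"
#include "storage/itemid.h"
#include "storage/itemptr.h"

/*
 * Estimate the space recovered by merging "nmerged" equal-keyed plain tuples
 * of size "plainsz" into one posting list tuple.  The posting list tuple
 * stores the key once, plus one heap TID per merged tuple, while the old
 * representation paid for a whole tuple and a line pointer per duplicate.
 */
static Size
projected_space_saving(Size plainsz, int nmerged)
{
	Size		postingsz;
	Size		before;

	Assert(nmerged > 1);

	/* Same shape as _bt_form_posting(): MAXALIGN(key portion + TID array) */
	postingsz = MAXALIGN(plainsz + nmerged * sizeof(ItemPointerData));

	/* What the duplicates cost as separate tuples, line pointers included */
	before = nmerged * (MAXALIGN(plainsz) + sizeof(ItemIdData));

	return before - (postingsz + sizeof(ItemIdData));
}

For instance, with 8-byte maximum alignment and 6-byte item pointers, 100 duplicates with a 16-byte key tuple occupy about 2,000 bytes of tuples and line pointers, whereas the equivalent posting list tuple needs 616 bytes plus a single 4-byte line pointer.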
+ +Unique index leaf pages only get a deduplication pass when an insertion +(that might have to split the page) observed an existing duplicate on the +page in passing. This is based on the assumption that deduplication will +only work out when _all_ new insertions are duplicates from UPDATEs. This +may mean that we miss an opportunity to delay a page split, but that's +okay because our ultimate goal is to delay leaf page splits _indefinitely_ +(i.e. to prevent them altogether). There is little point in trying to +delay a split that is probably inevitable anyway. This allows us to avoid +the overhead of attempting to deduplicate with unique indexes that always +have few or no duplicates. + +Posting list splits +------------------- + +When the incoming tuple happens to overlap with an existing posting list, +a posting list split is performed. Like a page split, a posting list +split resolves a situation where a new/incoming item "won't fit", while +inserting the incoming item in passing (i.e. as part of the same atomic +action). It's possible (though not particularly likely) that an insert of +a new item on to an almost-full page will overlap with a posting list, +resulting in both a posting list split and a page split. Even then, the +atomic action that splits the posting list also inserts the new item +(since page splits always insert the new item in passing). Including the +posting list split in the same atomic action as the insert avoids problems +caused by concurrent inserts into the same posting list -- the exact +details of how we change the posting list depend upon the new item, and +vice-versa. A single atomic action also minimizes the volume of extra +WAL required for a posting list split, since we don't have to explicitly +WAL-log the original posting list tuple. + +Despite piggy-backing on the same atomic action that inserts a new tuple, +posting list splits can be thought of as a separate, extra action to the +insert itself (or to the page split itself). Posting list splits +conceptually "rewrite" an insert that overlaps with an existing posting +list into an insert that adds its final new item just to the right of the +posting list instead. The size of the posting list won't change, and so +page space accounting code does not need to care about posting list splits +at all. This is an important upside of our design; the page split point +choice logic is very subtle even without it needing to deal with posting +list splits. + +Only a few isolated extra steps are required to preserve the illusion that +the new item never overlapped with an existing posting list in the first +place: the heap TID of the incoming tuple is swapped with the rightmost/max +heap TID from the existing/originally overlapping posting list. Also, the +posting-split-with-page-split case must generate a new high key based on +an imaginary version of the original page that has both the final new item +and the after-list-split posting tuple (page splits usually just operate +against an imaginary version that contains the new item/item that won't +fit). + +This approach avoids inventing an "eager" atomic posting split operation +that splits the posting list without simultaneously finishing the insert +of the incoming item. This alternative design might seem cleaner, but it +creates subtle problems for page space accounting. 
In general, there +might not be enough free space on the page to split a posting list such +that the incoming/new item no longer overlaps with either posting list +half --- the operation could fail before the actual retail insert of the +new item even begins. We'd end up having to handle posting list splits +that need a page split anyway. Besides, supporting variable "split points" +while splitting posting lists won't actually improve overall space +utilization. Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000000..e5481f2f93 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,830 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate items in Postgres btrees. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + +static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem); +static void _bt_singleval_fillfactor(Page page, BTDedupState state, + Size newitemsz); +#ifdef USE_ASSERT_CHECKING +static bool _bt_posting_valid(IndexTuple posting); +#endif + +/* + * Deduplicate items on a leaf page. The page will have to be split by caller + * if we cannot successfully free at least newitemsz (we also need space for + * newitem's line pointer, which isn't included in caller's newitemsz). + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is sometimes used for !checkingunique callers, in which + * case deduplication will leave a few tuples untouched at the end of the + * page. The general idea is to prepare the page for an anticipated page + * split that uses nbtsplitloc.c's "single value" strategy to determine a + * split point. (There is no reason to deduplicate items that will end up on + * the right half of the page after the anticipated page split; better to + * handle those if and when the anticipated right half page gets its own + * deduplication pass, following further inserts of duplicates.) + * + * This function should be called during insertion, when the page doesn't have + * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag + * was set, caller should have removed any LP_DEAD items by calling + * _bt_vacuum_one_page() before calling here. We may still have to kill + * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset, + * but that should be rare. Also, _bt_vacuum_one_page() won't unset the + * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful + * deduplication pass will always clear it, just to keep things tidy. 
+ */ +void +_bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, bool checkingunique) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + Page newpage; + int newpagendataitems = 0; + OffsetNumber deletable[MaxIndexTuplesPerPage]; + BTDedupState state; + int ndeletable = 0; + Size pagesaving = 0; + bool singlevalstrat = false; + int natts = IndexRelationGetNumberOfAttributes(rel); + + /* + * We can't assume that there are no LP_DEAD items. For one thing, VACUUM + * will clear the BTP_HAS_GARBAGE hint without reliably removing items + * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD + * bits when deduplicating items. Allowing it would be correct, though + * wasteful. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemid)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel); + + /* + * Return when a split will be avoided. This is equivalent to + * avoiding a split using the usual _bt_vacuum_one_page() path. + */ + if (PageGetFreeSpace(page) >= newitemsz) + return; + + /* + * Reconsider number of items on page, in case _bt_delitems_delete() + * managed to delete an item or two + */ + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* + * By here, it's clear that deduplication will definitely be attempted. + * Initialize deduplication state. + * + * It would be possible for maxpostingsize (limit on posting list tuple + * size) to be set to one third of the page. However, it seems like a + * good idea to limit the size of posting lists to one sixth of a page. + * That ought to leave us with a good split point when pages full of + * duplicates can be split several times. + */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + /* nintervals should be initialized to zero */ + state->nintervals = 0; + + /* Determine if "single value" strategy should be used */ + if (!checkingunique) + singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + + /* + * Deduplicate items from page, and write them to newpage. + * + * Copy the original page's LSN into newpage copy. This will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. 
+ */ + newpage = PageGetTempPageCopySpecial(page); + PageSetLSN(newpage, PageGetLSN(page)); + + /* Copy high key, if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (state->deduplicate && + _bt_keep_natts_fast(rel, state->base, itup) > natts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed current + * maxpostingsize). + * + * If state contains pending posting list with more than one item, + * form new posting tuple, and actually update the page. Else + * reset the state and move on without modifying the page. + */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + newpagendataitems++; + + if (singlevalstrat) + { + /* + * Single value strategy's extra steps. + * + * Lower maxpostingsize for sixth and final item that might be + * deduplicated by current deduplication pass. When sixth + * item formed/observed, stop deduplicating items. + * + * Note: It's possible that this will be reached even when + * current deduplication pass has yet to merge together some + * existing items. It doesn't matter whether or not the + * current call generated the maxpostingsize-capped duplicate + * tuples at the start of the page. + */ + if (newpagendataitems == 5) + _bt_singleval_fillfactor(page, state, newitemsz); + else if (newpagendataitems == 6) + { + state->deduplicate = false; + singlevalstrat = false; /* won't be back here */ + } + } + + /* itup starts new pending posting list */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + + /* Handle the last item */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + newpagendataitems++; + + /* + * If no items suitable for deduplication were found, newpage must be + * exactly the same as the original page, so just return from function. + * + * We could determine whether or not to proceed on the basis the space + * savings being sufficient to avoid an immediate page split instead. We + * don't do that because there is some small value in nbtsplitloc.c always + * operating against a page that is fully deduplicated (apart from + * newitem). Besides, most of the cost has already been paid. + */ + if (state->nintervals == 0) + { + /* cannot leak memory here */ + pfree(newpage); + pfree(state->htids); + pfree(state); + return; + } + + /* + * By here, it's clear that deduplication will definitely go ahead. + * + * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is + * still falsely set, just to keep things tidy. 
(We can't rely on + * _bt_vacuum_one_page() having done this already, and we can't rely on a + * page split or VACUUM getting to it in the near future.) + */ + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.nintervals = state->nintervals; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + /* + * The intervals array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. + */ + XLogRegisterBufData(0, (char *) state->intervals, + state->nintervals * sizeof(BTDedupInterval)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Create a new pending posting list tuple based on caller's base tuple. + * + * Every tuple processed by deduplication either becomes the base tuple for a + * posting list, or gets its heap TID(s) accepted into a pending posting list. + * A tuple that starts out as the base tuple for a posting list will only + * actually be rewritten within _bt_dedup_finish_pending() when it turns out + * that there are duplicates that can be merged into the base tuple. + */ +void +_bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff) +{ + Assert(state->nhtids == 0); + Assert(state->nitems == 0); + Assert(!BTreeTupleIsPivot(base)); + + /* + * Copy heap TID(s) from new base tuple for new candidate posting list + * into working state's array + */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData)); + state->nhtids = 1; + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* basetupsize should not include existing posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain physical size of all existing tuples (including line + * pointer overhead) so that we can calculate space savings on page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->intervals[state->nintervals].baseoff = state->baseoff; +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state now + * includes itup's heap TID(s). 
+ */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over maxpostingsize limit. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxpostingsize) + return false; + + /* + * Save heap TIDs to pending posting list tuple -- itup can be merged into + * pending posting list + */ + state->nitems++; + memcpy(state->htids + state->nhtids, htids, + sizeof(ItemPointerData) * nhtids); + state->nhtids += nhtids; + state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); + + return true; +} + +/* + * Finalize pending posting list tuple, and add it to the page. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * Returns space saving from deduplicating to make a new posting list tuple. + * Note that this includes line pointer overhead. This is zero in the case + * where no deduplication was possible. + */ +Size +_bt_dedup_finish_pending(Page newpage, BTDedupState state) +{ + OffsetNumber tupoff; + Size tuplesz; + Size spacesaving; + + Assert(state->nitems > 0); + Assert(state->nitems <= state->nhtids); + Assert(state->intervals[state->nintervals].baseoff == state->baseoff); + + tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage)); + if (state->nitems == 1) + { + /* Use original, unchanged base tuple */ + tuplesz = IndexTupleSize(state->base); + if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + spacesaving = 0; + } + else + { + IndexTuple final; + + /* Form a tuple with a posting list */ + final = _bt_form_posting(state->base, state->htids, state->nhtids); + tuplesz = IndexTupleSize(final); + Assert(tuplesz <= state->maxpostingsize); + + /* Save final number of items for posting list */ + state->intervals[state->nintervals].nitems = state->nitems; + + Assert(tuplesz == MAXALIGN(IndexTupleSize(final))); + if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false, + false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + pfree(final); + spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData)); + /* Increment nintervals, since we wrote a new posting list tuple */ + state->nintervals++; + Assert(spacesaving > 0 && spacesaving < BLCKSZ); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + + return spacesaving; +} + +/* + * Determine if page non-pivot tuples (data items) are all duplicates of the + * same value -- if they are, deduplication's "single value" strategy should + * be applied. The general goal of this strategy is to ensure that + * nbtsplitloc.c (which uses its own single value strategy) will find a useful + * split point as further duplicates are inserted, and successive rightmost + * page splits occur among pages that store the same duplicate value. 
When + * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, + * just like it would if deduplication were disabled. + * + * We expect that affected workloads will require _several_ single value + * strategy deduplication passes (over a page that only stores duplicates) + * before the page is finally split. The first deduplication pass should only + * find regular non-pivot tuples. Later deduplication passes will find + * existing maxpostingsize-capped posting list tuples, which must be skipped + * over. The penultimate pass is generally the first pass that actually + * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a + * few untouched non-pivot tuples. The final deduplication pass won't free + * any space -- it will skip over everything without merging anything (it + * retraces the steps of the penultimate pass). + * + * Fortunately, having several passes isn't too expensive. Each pass (after + * the first pass) won't spend many cycles on the large posting list tuples + * left by previous passes. Each pass will find a large contiguous group of + * smaller duplicate tuples to merge together at the end of the page. + * + * Note: We deliberately don't bother checking if the high key is a distinct + * value (prior to the TID tiebreaker column) before proceeding, unlike + * nbtsplitloc.c. Its single value strategy only gets applied on the + * rightmost page of duplicates of the same value (other leaf pages full of + * duplicates will get a simple 50:50 page split instead of splitting towards + * the end of the page). There is little point in making the same distinction + * here. + */ +static bool +_bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem) +{ + int natts = IndexRelationGetNumberOfAttributes(rel); + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > natts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > natts) + return true; + } + + return false; +} + +/* + * Lower maxpostingsize when using "single value" strategy, to avoid a sixth + * and final maxpostingsize-capped tuple. The sixth and final posting list + * tuple will end up somewhat smaller than the first five. (Note: The first + * five tuples could actually just be very large duplicate tuples that + * couldn't be merged together at all. Deduplication will simply not modify + * the page when that happens.) + * + * When there are six posting lists on the page (after current deduplication + * pass goes on to create/observe a sixth very large tuple), caller should end + * its deduplication pass. It isn't useful to try to deduplicate items that + * are supposed to end up on the new right sibling page following the + * anticipated page split. A future deduplication pass of future right + * sibling page might take care of it. (This is why the first single value + * strategy deduplication pass for a given leaf page will generally find only + * plain non-pivot tuples -- see _bt_do_singleval() comments.) 
+ */ +static void +_bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) +{ + Size leftfree; + int reduction; + + /* This calculation needs to match nbtsplitloc.c */ + leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - + MAXALIGN(sizeof(BTPageOpaqueData)); + /* Subtract size of new high key (includes pivot heap TID space) */ + leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); + + /* + * Reduce maxpostingsize by an amount equal to target free space on left + * half of page + */ + reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0); + if (state->maxpostingsize > reduction) + state->maxpostingsize -= reduction; + else + state->maxpostingsize = 0; +} + +/* + * Build a posting list tuple based on caller's "base" index tuple and list of + * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a + * posting list. (Posting list tuples can never have a single heap TID, partly + * because that ensures that deduplication always reduces final MAXALIGN()'d + * size of entire tuple.) + * + * Convention is that posting list starts at a MAXALIGN()'d offset (rather + * than a SHORTALIGN()'d offset), in line with the approach taken when + * appending a heap TID to new pivot tuple/high key during suffix truncation. + * This sometimes wastes a little space that was only needed as alignment + * padding in the original tuple. Following this convention simplifies the + * space accounting used when deduplicating a page (the same convention + * simplifies the accounting for choosing a point to split a page at). + * + * Note: Caller's "htids" array must be unique and already in ascending TID + * order. Any existing heap TIDs from "base" won't automatically appear in + * returned posting list tuple (they must be included in htids array.) + */ +IndexTuple +_bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize; + IndexTuple itup; + + if (BTreeTupleIsPosting(base)) + keysize = BTreeTupleGetPostingOffset(base); + else + keysize = IndexTupleSize(base); + + Assert(!BTreeTupleIsPivot(base)); + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + Assert(keysize == MAXALIGN(keysize)); + + /* Determine final size of new tuple */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + Assert(newsize <= INDEX_SIZE_MASK); + Assert(newsize == MAXALIGN(newsize)); + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, base, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + Assert(_bt_posting_valid(itup)); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + Assert(ItemPointerIsValid(&itup->t_tid)); + } + + return itup; +} + +/* + * Generate a replacement tuple by "updating" a posting list tuple so that it + * no longer has TIDs that need to be deleted. + * + * Used by VACUUM. Caller's vacposting argument points to the existing + * posting list tuple to be updated. + * + * On return, caller's vacposting argument will point to final "updated" + * tuple, which will be palloc()'d in caller's memory context. 
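+ *
+ * For example, with an original posting list of four heap TIDs and a
+ * vacposting whose ndeletedtids is 2 and whose deletetids[] is {1, 3},
+ * the replacement tuple keeps only the TIDs at positions 0 and 2, ending
+ * up as a posting list tuple with two heap TIDs.  Had three of the four
+ * TIDs been listed in deletetids[], the replacement would instead be a
+ * standard non-pivot tuple, since posting list tuples never store just a
+ * single heap TID.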
+ */ +void +_bt_update_posting(BTVacuumPosting vacposting) +{ + IndexTuple origtuple = vacposting->itup; + uint32 keysize, + newsize; + IndexTuple itup; + int nhtids; + int ui, + d; + ItemPointer htids; + + nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids; + + Assert(_bt_posting_valid(origtuple)); + Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple)); + + if (BTreeTupleIsPosting(origtuple)) + keysize = BTreeTupleGetPostingOffset(origtuple); + else + keysize = IndexTupleSize(origtuple); + + /* + * Determine final size of new tuple. + * + * This calculation needs to match the code used within _bt_form_posting() + * for new posting list tuples. We avoid calling _bt_form_posting() here + * to save ourselves a second memory allocation for a htids workspace. + */ + if (nhtids > 1) + newsize = MAXALIGN(keysize + + nhtids * sizeof(ItemPointerData)); + else + newsize = keysize; + + /* Allocate memory using palloc0() (matches index_form_tuple()) */ + itup = palloc0(newsize); + memcpy(itup, origtuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting list tuple */ + BTreeTupleSetPosting(itup, nhtids, keysize); + htids = BTreeTupleGetPosting(itup); + } + else + { + /* Form standard non-pivot tuple */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + htids = &itup->t_tid; + } + + ui = 0; + d = 0; + for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++) + { + if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i) + { + d++; + continue; + } + htids[ui++] = *BTreeTupleGetPostingN(origtuple, i); + } + Assert(ui == nhtids); + Assert(d == vacposting->ndeletedtids); + Assert(nhtids == 1 || _bt_posting_valid(itup)); + + /* vacposting arg's itup will now point to updated version */ + vacposting->itup = itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). Modifies newitem, so caller should pass their own private + * copy that can safely be modified. + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified newitem is + * what caller actually inserts. (This happens inside the same critical + * section that performs an in-place update of old posting list using new + * posting list returned here.) + * + * While the keys from newitem and oposting must be opclass equal, and must + * generate identical output when run through the underlying type's output + * function, it doesn't follow that their representations match exactly. + * Caller must avoid assuming that there can't be representational differences + * that make datums from oposting bigger or smaller than the corresponding + * datums from newitem. For example, differences in TOAST input state might + * break a faulty assumption about tuple size (the executor is entitled to + * apply TOAST compression based on its own criteria). It also seems possible + * that further representational variation will be introduced in the future, + * in order to support nbtree features like page-level prefix compression. + * + * See nbtree/README for details on the design of posting list splits. 
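+ *
+ * To sketch the swap itself: if oposting's posting list holds heap TIDs
+ * {A, B, D} and newitem arrives with heap TID C (making postingoff 2),
+ * then the returned nposting holds {A, B, C}, while newitem is modified
+ * to carry D, oposting's original rightmost/max heap TID.  Caller then
+ * overwrites oposting with nposting in place, and inserts the modified
+ * newitem at the very next page offset.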
+ */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *replaceposright; + Size nmovebytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(_bt_posting_valid(oposting)); + Assert(postingoff > 0 && postingoff < nhtids); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID. We shift TIDs one place to the right, losing original + * rightmost TID. (nmovebytes must not include TIDs to the left of + * postingoff, nor the existing rightmost/max TID that gets overwritten.) + */ + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1); + nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + memmove(replaceposright, replacepos, nmovebytes); + + /* Fill the gap at postingoff with TID of new item (original new TID) */ + Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem)); + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Now copy oposting's rightmost/max TID into new item (final new TID) */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid); + + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(_bt_posting_valid(nposting)); + + return nposting; +} + +/* + * Verify posting list invariants for "posting", which must be a posting list + * tuple. Used within assertions. + */ +#ifdef USE_ASSERT_CHECKING +static bool +_bt_posting_valid(IndexTuple posting) +{ + ItemPointerData last; + ItemPointer htid; + + if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2) + return false; + + /* Remember first heap TID for loop */ + ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last); + if (!ItemPointerIsValid(&last)) + return false; + + /* Iterate, starting from second TID */ + for (int i = 1; i < BTreeTupleGetNPosting(posting); i++) + { + htid = BTreeTupleGetPostingN(posting, i); + + if (!ItemPointerIsValid(htid)) + return false; + if (ItemPointerCompare(htid, &last) <= 0) + return false; + ItemPointerCopy(htid, &last); + } + + return true; +} +#endif diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 4e5849ab8e..b913543221 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -47,10 +47,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, @@ -125,6 +127,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, insertstate.itup_key = itup_key; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; /* * It's very common to have an index on an auto-incremented or @@ -295,7 +298,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, 
false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -340,6 +343,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, uint32 *speculativeToken) { IndexTuple itup = insertstate->itup; + IndexTuple curitup; + ItemId curitemid; BTScanInsert itup_key = insertstate->itup_key; SnapshotData SnapshotDirty; OffsetNumber offset; @@ -348,6 +353,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + bool inposting = false; + bool prevalldead = true; + int curposti = 0; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -375,13 +383,21 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(itup_key->scantid == NULL); for (;;) { - ItemId curitemid; - IndexTuple curitup; - BlockNumber nblkno; - /* - * make sure the offset points to an actual item before trying to - * examine it... + * Each iteration of the loop processes one heap TID, not one index + * tuple. Current offset number for page isn't usually advanced on + * iterations that process heap TIDs from posting list tuples. + * + * "inposting" state is set when _inside_ a posting list --- not when + * we're at the start (or end) of a posting list. We advance curposti + * at the end of the iteration when inside a posting list tuple. In + * general, every loop iteration either advances the page offset or + * advances curposti --- an iteration that handles the rightmost/max + * heap TID in a posting list finally advances the page offset (and + * unsets "inposting"). + * + * Make sure the offset points to an actual index tuple before trying + * to examine it... */ if (offset <= maxoff) { @@ -406,31 +422,60 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, break; } - curitemid = PageGetItemId(page, offset); - /* - * We can skip items that are marked killed. + * We can skip items that are already marked killed. * * In the presence of heavy update activity an index may contain * many killed items with the same key; running _bt_compare() on * each killed item gets expensive. Just advance over killed * items as quickly as we can. We only apply _bt_compare() when - * we get to a non-killed item. Even those comparisons could be - * avoided (in the common case where there is only one page to - * visit) by reusing bounds, but just skipping dead items is fast - * enough. + * we get to a non-killed item. We could reuse the bounds to + * avoid _bt_compare() calls for known equal tuples, but it + * doesn't seem worth it. Workloads with heavy update activity + * tend to have many deduplication passes, so we'll often avoid + * most of those comparisons, too (we call _bt_compare() when the + * posting list tuple is initially encountered, though not when + * processing later TIDs from the same tuple). */ - if (!ItemIdIsDead(curitemid)) + if (!inposting) + curitemid = PageGetItemId(page, offset); + if (inposting || !ItemIdIsDead(curitemid)) { ItemPointerData htid; bool all_dead; - if (_bt_compare(rel, itup_key, page, offset) != 0) - break; /* we're past all the equal tuples */ + if (!inposting) + { + /* Plain tuple, or first TID in posting list tuple */ + if (_bt_compare(rel, itup_key, page, offset) != 0) + break; /* we're past all the equal tuples */ - /* okay, we gotta fetch the heap tuple ... 
*/ - curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + /* Advanced curitup */ + curitup = (IndexTuple) PageGetItem(page, curitemid); + Assert(!BTreeTupleIsPivot(curitup)); + } + + /* okay, we gotta fetch the heap tuple using htid ... */ + if (!BTreeTupleIsPosting(curitup)) + { + /* ... htid is from simple non-pivot tuple */ + Assert(!inposting); + htid = curitup->t_tid; + } + else if (!inposting) + { + /* ... htid is first TID in new posting list */ + inposting = true; + prevalldead = true; + curposti = 0; + htid = *BTreeTupleGetPostingN(curitup, 0); + } + else + { + /* ... htid is second or subsequent TID in posting list */ + Assert(curposti > 0); + htid = *BTreeTupleGetPostingN(curitup, curposti); + } /* * If we are doing a recheck, we expect to find the tuple we @@ -506,8 +551,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. */ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -565,12 +609,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && (!inposting || + (prevalldead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) { /* - * The conflicting tuple (or whole HOT chain) is dead to - * everyone, so we may as well mark the index entry - * killed. + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; @@ -584,14 +630,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prevalldead = false; } } - /* - * Advance to next tuple to continue checking. 
- */ - if (offset < maxoff) + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; offset = OffsetNumberNext(offset); + } else { int highkeycmp; @@ -606,7 +667,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* Advance to next non-dead page --- there must be one */ for (;;) { - nblkno = opaque->btpo_next; + BlockNumber nblkno = opaque->btpo_next; + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); page = BufferGetPage(nbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -616,6 +678,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); } + /* Will also advance to next tuple */ + curposti = 0; + inposting = false; maxoff = PageGetMaxOffsetNumber(page); offset = P_FIRSTDATAKEY(opaque); /* Don't invalidate binary search bounds */ @@ -684,6 +749,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber newitemoff; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -696,9 +762,13 @@ _bt_findinsertloc(Relation rel, Assert(!insertstate->bounds_valid || checkingunique); Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); + Assert(!itup_key->allequalimage || itup_key->heapkeyspace); if (itup_key->heapkeyspace) { + /* Keep track of whether checkingunique duplicate seen */ + bool uniquedup = false; + /* * If we're inserting into a unique index, we may have to walk right * through leaf pages to find the one leaf page that we must insert on @@ -715,6 +785,13 @@ _bt_findinsertloc(Relation rel, */ if (checkingunique) { + if (insertstate->low < insertstate->stricthigh) + { + /* Encountered a duplicate in _bt_check_unique() */ + Assert(insertstate->bounds_valid); + uniquedup = true; + } + for (;;) { /* @@ -741,18 +818,43 @@ _bt_findinsertloc(Relation rel, /* Update local state after stepping right */ page = BufferGetPage(insertstate->buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + /* Assume duplicates (if checkingunique) */ + uniquedup = true; } } /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that fails to free enough space, see if + * we can avoid a page split by performing a deduplication pass over + * the page. + * + * We only perform a deduplication pass for a checkingunique caller + * when the incoming item is a duplicate of an existing item on the + * leaf page. This heuristic avoids wasting cycles -- we only expect + * to benefit from deduplicating a unique index page when most or all + * recently added items are duplicates. See nbtree/README. 
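+		 *
+		 * For example, a non-unique index page that cannot fit newitemsz
+		 * always gets a deduplication pass at this point (provided
+		 * itup_key->allequalimage and BTGetDeduplicateItems() allow it),
+		 * whereas a unique index page only gets one when uniquedup was
+		 * set -- either because _bt_check_unique()'s binary search bounds
+		 * prove that a duplicate is present, or because we had to step
+		 * right or vacuum the page and so assume duplicates.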
*/ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + + /* Might as well assume duplicates (if checkingunique) */ + uniquedup = true; + } + + if (itup_key->allequalimage && BTGetDeduplicateItems(rel) && + (!checkingunique || uniquedup) && + PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -834,7 +936,30 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate); + + if (insertstate->postingoff == -1) + { + /* + * There is an overlapping posting list tuple with its LP_DEAD bit + * set. We don't want to unnecessarily unset its LP_DEAD bit while + * performing a posting list split, so delete all LP_DEAD items early. + * This is the only case where LP_DEAD deletes happen even though + * there is space for newitem on the page. + */ + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + + /* + * Do new binary search. New insert location cannot overlap with any + * posting list now. + */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + newitemoff = _bt_binsrch_insert(rel, insertstate); + Assert(insertstate->postingoff == 0); + } + + return newitemoff; } /* @@ -900,10 +1025,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * This recursive procedure does the following things: * + * + if postingoff != 0, splits existing posting list tuple + * (since it overlaps with new 'itup' tuple). * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). - * + inserts the tuple. + * + inserts the new tuple (might be split from posting list). * + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). @@ -931,11 +1058,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -949,6 +1080,7 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -959,6 +1091,34 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list. 
Overwriting the posting list with + * its post-split version is treated as an extra step in either the + * insert or page split critical section. + */ + Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid)); + Assert(itup_key->heapkeyspace && itup_key->allequalimage); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* use a mutable copy of itup as our itup from here on */ + origitup = itup; + itup = CopyIndexTuple(origitup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + /* itup now contains rightmost/max TID from oposting */ + + /* Alter offset so that newitem goes after posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? * @@ -991,7 +1151,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1066,6 +1227,9 @@ _bt_insertonpg(Relation rel, /* Do the update. No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + if (postingoff != 0) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + if (!_bt_pgaddtup(page, itemsz, itup, newitemoff)) elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); @@ -1115,8 +1279,19 @@ _bt_insertonpg(Relation rel, XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); - if (P_ISLEAF(lpageop)) + if (P_ISLEAF(lpageop) && postingoff == 0) + { + /* Simple leaf insert */ xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + xlinfo = XLOG_BTREE_INSERT_POST; + } else { /* @@ -1139,6 +1314,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.allequalimage = metad->btm_allequalimage; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1147,7 +1323,27 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + if (postingoff == 0) + { + /* Simple, common case -- log itup from caller */ + XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + } + else + { + /* + * Insert with posting list split (XLOG_BTREE_INSERT_POST + * record) case. + * + * Log postingoff. Also log origitup, not itup. REDO routine + * must reconstruct final itup (as well as nposting) using + * _bt_swap_posting(). 
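+			 *
+			 * (In effect, the REDO routine repeats the same
+			 * _bt_swap_posting() call that we performed above, which is
+			 * why logging origitup plus the two-byte postingoff is enough
+			 * to recreate both the final itup and nposting.)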
+ */ + uint16 upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1189,6 +1385,14 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + /* itup is actually a modified copy of caller's original */ + pfree(nposting); + pfree(itup); + } } /* @@ -1204,12 +1408,24 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * These extra posting list split details are used here in the same + * way as they are used in the more common case where a posting list + * split does not coincide with a page split. We need to deal with + * posting list splits directly in order to ensure that everything + * that follows from the insert of orignewitem is handled as a single + * atomic operation (though caller's insert of a new pivot/downlink + * into parent page will still be a separate operation). See + * nbtree/README for details on the design of posting list splits. + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) { Buffer rbuf; Page origpage; @@ -1229,6 +1445,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber leftoff, rightoff; OffsetNumber firstright; + OffsetNumber origpagepostingoff; OffsetNumber maxoff; OffsetNumber i; bool newitemonleft, @@ -1298,6 +1515,34 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, PageSetLSN(leftpage, PageGetLSN(origpage)); isleaf = P_ISLEAF(oopaque); + /* + * Determine page offset number of existing overlapped-with-orignewitem + * posting list when it is necessary to perform a posting list split in + * passing. Note that newitem was already changed by caller (newitem no + * longer has the orignewitem TID). + * + * This page offset number (origpagepostingoff) will be used to pretend + * that the posting split has already taken place, even though the + * required modifications to origpage won't occur until we reach the + * critical section. The lastleft and firstright tuples of our page split + * point should, in effect, come from an imaginary version of origpage + * that has the nposting tuple instead of the original posting list tuple. + * + * Note: _bt_findsplitloc() should have compensated for coinciding posting + * list splits in just the same way, at least in theory. It doesn't + * bother with that, though. In practice it won't affect its choice of + * split point. 
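+	 *
+	 * For example, if the incoming heap TID fell within the posting list
+	 * stored at offset 5, _bt_insertonpg() will already have advanced
+	 * newitemoff to 6 before calling here, so origpagepostingoff works out
+	 * to 5 -- the offset whose tuple must be read as though it were
+	 * already replaced by nposting.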
+ */ + origpagepostingoff = InvalidOffsetNumber; + if (postingoff != 0) + { + Assert(isleaf); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + Assert(BTreeTupleIsPosting(nposting)); + origpagepostingoff = OffsetNumberPrev(newitemoff); + } + /* * The "high key" for the new left page will be the first key that's going * to go into the new right page, or a truncated version if this is a leaf @@ -1335,6 +1580,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + if (firstright == origpagepostingoff) + item = nposting; } /* @@ -1368,6 +1615,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + if (lastleftoff == origpagepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1383,6 +1632,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, */ leftoff = P_HIKEY; + Assert(BTreeTupleIsPivot(lefthikey) || !itup_key->heapkeyspace); Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, @@ -1447,6 +1697,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, P_HIKEY); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleIsPivot(item) || !itup_key->heapkeyspace); Assert(BTreeTupleGetNAtts(item, rel) > 0); Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, @@ -1475,8 +1726,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* replace original item with nposting due to posting split? */ + if (i == origpagepostingoff) + { + Assert(BTreeTupleIsPosting(item)); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? */ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1645,8 +1904,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (postingoff != 0 && origpagepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1665,11 +1928,35 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. 
If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. REDO routine + * must reconstruct nposting and newitem using _bt_swap_posting(). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem/newitem despite newitem + * going on the right page. If XLogInsert decides that it can omit + * orignewitem due to logging a full-page image of the left page, + * everything still works out, since recovery only needs to log + * orignewitem for items on the left page (just like the regular + * newitem-logged case). */ - if (newitemonleft) + if (newitemonleft && xlrec.postingoff == 0) XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + else if (xlrec.postingoff != 0) + { + Assert(newitemonleft || firstright == newitemoff); + Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem)); + XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz)); + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1829,7 +2116,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2185,6 +2472,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2265,7 +2553,7 @@ _bt_pgaddtup(Page page, static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) { - OffsetNumber deletable[MaxOffsetNumber]; + OffsetNumber deletable[MaxIndexTuplesPerPage]; int ndeletable = 0; OffsetNumber offnum, minoff, @@ -2298,6 +2586,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. + * the page, or when deduplication runs. 
*/ } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index f05cbe7467..39b8f17f4b 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -37,6 +38,8 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty); +static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, + OffsetNumber *deletable, int ndeletable); static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, Buffer *topparent, OffsetNumber *topoff, BlockNumber *target, BlockNumber *rightsib); @@ -47,7 +50,8 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +67,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_allequalimage = allequalimage; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -102,6 +107,9 @@ _bt_upgrademetapage(Page page) metad->btm_version = BTREE_NOVAC_VERSION; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_allequalimage); + metad->btm_allequalimage = false; /* Adjust pd_lower (see _bt_initmetapage() for details) */ ((PageHeader) page)->pd_lower = @@ -213,6 +221,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.fastlevel = metad->btm_fastlevel; md.oldest_btpo_xact = oldestBtpoXact; md.last_cleanup_num_heap_tuples = numHeapTuples; + md.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); @@ -274,6 +283,8 @@ _bt_getroot(Relation rel, int access) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; @@ -394,6 +405,7 @@ _bt_getroot(Relation rel, int access) md.fastlevel = 0; md.oldest_btpo_xact = InvalidTransactionId; md.last_cleanup_num_heap_tuples = -1.0; + md.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -618,22 +630,34 @@ _bt_getrootheight(Relation rel) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_fastroot != P_NONE); return metad->btm_fastlevel; } /* - * _bt_heapkeyspace() -- is heap TID being treated as a key? + * _bt_metaversion() -- Get version/status info from metapage. 
+ * + * Sets caller's *heapkeyspace and *allequalimage arguments using data + * from the B-Tree metapage (could be locally-cached version). This + * information needs to be stashed in insertion scankey, so we provide a + * single function that fetches both at once. * * This is used to determine the rules that must be used to descend a * btree. Version 4 indexes treat heap TID as a tiebreaker attribute. * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable * performance when inserting a new BTScanInsert-wise duplicate tuple * among many leaf pages already full of such duplicates. + * + * Also sets allequalimage field, which indicates whether or not it is + * safe to apply deduplication. We rely on the assumption that + * btm_allequalimage will be zero'ed on heapkeyspace indexes that were + * pg_upgrade'd from Postgres 12. */ -bool -_bt_heapkeyspace(Relation rel) +void +_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage) { BTMetaPageData *metad; @@ -651,10 +675,11 @@ _bt_heapkeyspace(Relation rel) */ if (metad->btm_root == P_NONE) { - uint32 btm_version = metad->btm_version; + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; _bt_relbuf(rel, metabuf); - return btm_version > BTREE_NOVAC_VERSION; + return; } /* @@ -678,9 +703,12 @@ _bt_heapkeyspace(Relation rel) Assert(metad->btm_magic == BTREE_MAGIC); Assert(metad->btm_version >= BTREE_MIN_VERSION); Assert(metad->btm_version <= BTREE_VERSION); + Assert(!metad->btm_allequalimage || + metad->btm_version > BTREE_NOVAC_VERSION); Assert(metad->btm_fastroot != P_NONE); - return metad->btm_version > BTREE_NOVAC_VERSION; + *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; + *allequalimage = metad->btm_allequalimage; } /* @@ -964,28 +992,106 @@ _bt_page_recyclable(Page page) * Delete item(s) from a btree leaf page during VACUUM. * * This routine assumes that the caller has a super-exclusive write lock on - * the buffer. Also, the given deletable array *must* be sorted in ascending - * order. + * the buffer. Also, the given deletable and updatable arrays *must* be + * sorted in ascending order. + * + * Routine deals with deleting TIDs when some (but not all) of the heap TIDs + * in an existing posting list item are to be removed by VACUUM. This works + * by updating/overwriting an existing item with caller's new version of the + * item (a version that lacks the TIDs that are to be deleted). * * We record VACUUMs and b-tree deletes differently in WAL. Deletes must * generate their own latestRemovedXid by accessing the heap directly, whereas - * VACUUMs rely on the initial heap scan taking care of it indirectly. + * VACUUMs rely on the initial heap scan taking care of it indirectly. Also, + * only VACUUM can perform granular deletes of individual TIDs in posting list + * tuples. 
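+ *
+ * To illustrate the division of labor: a posting list tuple whose heap
+ * TIDs are all dead reaches us through the deletable array, and the whole
+ * item is removed, whereas a posting list tuple with only some dead TIDs
+ * reaches us through the updatable array as a BTVacuumPosting entry, and
+ * is overwritten in place with the smaller replacement tuple built by
+ * _bt_update_posting().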
*/ void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable) + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + char *updatedbuf = NULL; + Size updatedbuflen = 0; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; /* Shouldn't be called unless there's something to do */ - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + + for (int i = 0; i < nupdatable; i++) + { + /* Replace work area IndexTuple with updated version */ + _bt_update_posting(updatable[i]); + + /* Maintain array of updatable page offsets for WAL record */ + updatedoffsets[i] = updatable[i]->updatedoffset; + } + + /* XLOG stuff -- allocate and fill buffer before critical section */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + + itemsz = SizeOfBtreeUpdate + + vacposting->ndeletedtids * sizeof(uint16); + updatedbuflen += itemsz; + } + + updatedbuf = palloc(updatedbuflen); + for (int i = 0; i < nupdatable; i++) + { + BTVacuumPosting vacposting = updatable[i]; + xl_btree_update update; + + update.ndeletedtids = vacposting->ndeletedtids; + memcpy(updatedbuf + offset, &update.ndeletedtids, + SizeOfBtreeUpdate); + offset += SizeOfBtreeUpdate; + + itemsz = update.ndeletedtids * sizeof(uint16); + memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); + offset += itemsz; + } + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); - /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + /* + * Handle posting tuple updates. + * + * Deliberately do this before handling simple deletes. If we did it the + * other way around (i.e. WAL record order -- simple deletes before + * updates) then we'd have to make compensating changes to the 'updatable' + * array of offset numbers. + * + * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it + * happens to already be set. Although we unset the BTP_HAS_GARBAGE page + * level flag, unsetting individual LP_DEAD bits should still be avoided. + */ + for (int i = 0; i < nupdatable; i++) + { + OffsetNumber updatedoffset = updatedoffsets[i]; + IndexTuple itup; + + itup = updatable[i]->itup; + itemsz = MAXALIGN(IndexTupleSize(itup)); + if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, + itemsz)) + elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", + BufferGetBlockNumber(buf), RelationGetRelationName(rel)); + } + + /* Now handle simple deletes of entire tuples */ + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -1006,7 +1112,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * limited, since we never falsely unset an LP_DEAD bit. Workloads that * are particularly dependent on LP_DEAD bits being set quickly will * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up - * again anyway. + * again anyway. Furthermore, attempting a deduplication pass will remove + * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit + * is set or not. 
*/ opaque->btpo_flags &= ~BTP_HAS_GARBAGE; @@ -1019,18 +1127,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); - /* - * The deletable array is not in the buffer, but pretend that it is. - * When XLogInsert stores the whole buffer, the array need not be - * stored too. - */ - XLogRegisterBufData(0, (char *) deletable, - ndeletable * sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + if (nupdatable > 0) + { + XLogRegisterBufData(0, (char *) updatedoffsets, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updatedbuf, updatedbuflen); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1038,6 +1150,13 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, } END_CRIT_SECTION(); + + /* can't leak memory here */ + if (updatedbuf != NULL) + pfree(updatedbuf); + /* free tuples generated by calling _bt_update_posting() */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]->itup); } /* @@ -1050,6 +1169,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, * This is nearly the same as _bt_delitems_vacuum as far as what it does to * the page, but it needs to generate its own latestRemovedXid by accessing * the heap. This is used by the REDO routine to generate recovery conflicts. + * Also, it doesn't handle posting list tuples unless the entire tuple can be + * deleted as a whole (since there is only one LP_DEAD bit per line pointer). */ void _bt_delitems_delete(Relation rel, Buffer buf, @@ -1065,8 +1186,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - deletable, ndeletable); + _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -1113,6 +1233,83 @@ _bt_delitems_delete(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed to by the non-pivot + * tuples being deleted. + * + * This is a specialized version of index_compute_xid_horizon_for_tuples(). + * It's needed because btree tuples don't always store table TID using the + * standard index tuple header field. 
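+ *
+ * Note that the htids[] work array can grow past ndeletable entries: a
+ * single LP_DEAD posting list tuple contributes every one of its heap
+ * TIDs.  For example, deleting two plain tuples plus one posting list
+ * tuple containing 3 heap TIDs means passing 5 heap TIDs to
+ * table_compute_xid_horizon_for_tuples().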
+ */ +static TransactionId +_bt_xid_horizon(Relation rel, Relation heapRel, Page page, + OffsetNumber *deletable, int ndeletable) +{ + TransactionId latestRemovedXid = InvalidTransactionId; + int spacenhtids; + int nhtids; + ItemPointer htids; + + /* Array will grow iff there are posting list tuples to consider */ + spacenhtids = ndeletable; + nhtids = 0; + htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids); + for (int i = 0; i < ndeletable; i++) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, deletable[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + Assert(!BTreeTupleIsPivot(itup)); + + if (!BTreeTupleIsPosting(itup)) + { + if (nhtids + 1 > spacenhtids) + { + spacenhtids *= 2; + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * spacenhtids); + } + + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(&itup->t_tid, &htids[nhtids]); + nhtids++; + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + if (nhtids + nposting > spacenhtids) + { + spacenhtids = Max(spacenhtids * 2, nhtids + nposting); + htids = (ItemPointer) + repalloc(htids, sizeof(ItemPointerData) * spacenhtids); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + + Assert(ItemPointerIsValid(htid)); + ItemPointerCopy(htid, &htids[nhtids]); + nhtids++; + } + } + } + + Assert(nhtids >= ndeletable); + + latestRemovedXid = + table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids); + + pfree(htids); + + return latestRemovedXid; +} + /* * Returns true, if the given block has the half-dead flag set. */ @@ -2058,6 +2255,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 5254bc7ef5..4bb16297c3 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -95,6 +95,10 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static BTVacuumPosting btreevacuumposting(BTVacState *vstate, + IndexTuple posting, + OffsetNumber updatedoffset, + int *nremaining); /* @@ -161,7 +165,7 @@ btbuildempty(Relation index) /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); /* * Write the page and log it. 
It might seem that an immediate sync would @@ -264,8 +268,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxTIDsPerBTreePage * sizeof(int)); + if (so->numKilled < MaxTIDsPerBTreePage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -1154,11 +1158,15 @@ restart: } else if (P_ISLEAF(opaque)) { - OffsetNumber deletable[MaxOffsetNumber]; + OffsetNumber deletable[MaxIndexTuplesPerPage]; int ndeletable; + BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + int nupdatable; OffsetNumber offnum, minoff, maxoff; + int nhtidsdead, + nhtidslive; /* * Trade in the initial read lock for a super-exclusive write lock on @@ -1190,8 +1198,11 @@ restart: * point using callback. */ ndeletable = 0; + nupdatable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); + nhtidsdead = 0; + nhtidslive = 0; if (callback) { for (offnum = minoff; @@ -1199,11 +1210,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM @@ -1226,22 +1235,82 @@ restart: * simple, and allows us to always avoid generating our own * conflicts. */ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + Assert(!BTreeTupleIsPivot(itup)); + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard table TID representation */ + if (callback(&itup->t_tid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + BTVacuumPosting vacposting; + int nremaining; + + /* Posting list tuple */ + vacposting = btreevacuumposting(vstate, itup, offnum, + &nremaining); + if (vacposting == NULL) + { + /* + * All table TIDs from the posting tuple remain, so no + * delete or update required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + + /* + * Store metadata about posting list tuple in + * updatable array for entire page. Existing tuple + * will be updated during the later call to + * _bt_delitems_vacuum(). + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatable[nupdatable++] = vacposting; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + } + else + { + /* + * All table TIDs from the posting list must be + * deleted. We'll delete the index tuple completely + * (no update required). + */ + Assert(nremaining == 0); + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + pfree(vacposting); + } + + nhtidslive += nremaining; + } } } /* - * Apply any needed deletes. We issue just one _bt_delitems_vacuum() - * call per page, so as to minimize WAL traffic. + * Apply any needed deletes or updates. We issue just one + * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic. 
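+		 *
+		 * Note that stats->tuples_removed is advanced by nhtidsdead rather
+		 * than by ndeletable below, since deleting a single posting list
+		 * tuple can remove many dead table TIDs at once.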
*/ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { - _bt_delitems_vacuum(rel, buf, deletable, ndeletable); + Assert(nhtidsdead >= Max(ndeletable, 1)); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + nupdatable); - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); + + /* can't leak memory here */ + for (int i = 0; i < nupdatable; i++) + pfree(updatable[i]); } else { @@ -1254,6 +1323,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1263,15 +1333,18 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into - * freePages out-of-order (doesn't seem worth any extra code to handle - * the case). + * If it's now empty, try to delete; else count the live tuples (live + * table TIDs in posting lists are counted as separate live tuples). + * We don't delete when recursing, though, to avoid putting entries + * into freePages out-of-order (doesn't seem worth any extra code to + * handle the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; + + Assert(!delete_now || nhtidslive == 0); } if (delete_now) @@ -1303,9 +1376,10 @@ restart: /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack - * space per recursion level (due to the deletable[] array). A failure is - * improbable since the number of levels isn't likely to be large ... but - * just in case, let's hand-optimize into a loop. + * space per recursion level (due to the arrays used to track details of + * deletable/updatable items). A failure is improbable since the number + * of levels isn't likely to be large ... but just in case, let's + * hand-optimize into a loop. */ if (recurse_to != P_NONE) { @@ -1314,6 +1388,61 @@ restart: } } +/* + * btreevacuumposting --- determine TIDs still needed in posting list + * + * Returns metadata describing how to build replacement tuple without the TIDs + * that VACUUM needs to delete. Returned value is NULL in the common case + * where no changes are needed to caller's posting list tuple (we avoid + * allocating memory here as an optimization). + * + * The number of TIDs that should remain in the posting list tuple is set for + * caller in *nremaining. + */ +static BTVacuumPosting +btreevacuumposting(BTVacState *vstate, IndexTuple posting, + OffsetNumber updatedoffset, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(posting); + ItemPointer items = BTreeTupleGetPosting(posting); + BTVacuumPosting vacposting = NULL; + + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* Live table TID */ + live++; + } + else if (vacposting == NULL) + { + /* + * First dead table TID encountered. + * + * It's now clear that we need to delete one or more dead table + * TIDs, so start maintaining metadata describing how to update + * existing posting list tuple. 
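+			 *
+			 * deletetids[] is sized for the worst case where all nitem of
+			 * the tuple's TIDs turn out to be dead.  When that happens,
+			 * caller sees *nremaining == 0, frees the returned vacposting,
+			 * and deletes the whole index tuple instead of updating it.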
+ */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + nitem * sizeof(uint16)); + + vacposting->itup = posting; + vacposting->updatedoffset = updatedoffset; + vacposting->ndeletedtids = 0; + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + else + { + /* Second or subsequent dead table TID */ + vacposting->deletetids[vacposting->ndeletedtids++] = i; + } + } + + *nremaining = live; + return vacposting; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index df065d72f8..7aaa8c17b0 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -142,6 +150,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, offnum = _bt_binsrch(rel, key, *bufP); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); blkno = BTreeTupleGetDownLink(itup); par_blkno = BufferGetBlockNumber(*bufP); @@ -434,7 +443,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +465,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +522,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +551,73 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Helper routine for _bt_binsrch_insert(). + * + * Returns offset into posting list where caller's scantid belongs. 
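+ *
+ * Three outcomes are possible: 0 when the item at offnum turns out not to
+ * be a posting list tuple after all (no posting list split is needed),
+ * -1 when the posting list tuple's line pointer is marked LP_DEAD (caller
+ * is expected to remove dead items and then repeat its page-level binary
+ * search), and otherwise the position within the posting list where
+ * scantid belongs, which _bt_binsrch_insert() saves as
+ * insertstate->postingoff.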
+ *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + Assert(key->heapkeyspace && key->allequalimage); + + /* + * In the event that posting list tuple has LP_DEAD bit set, indicate this + * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A + * second call to _bt_binsrch_insert() can take place when its caller has + * removed the dead item. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +627,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +658,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -592,12 +688,12 @@ _bt_compare(Relation rel, ncmpkey = Min(ntupatts, key->keysz); Assert(key->heapkeyspace || ncmpkey == key->keysz); + Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); scankey = key->scankeys; for (int i = 1; i <= ncmpkey; i++) { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -712,8 +808,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * Scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. 
In all other cases + * there can only be a single heap TID value, which is compared directly + * with scantid. + */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1228,7 +1341,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* Initialize remaining insertion scan key fields */ - inskey.heapkeyspace = _bt_heapkeyspace(rel); + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; @@ -1483,9 +1596,35 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { - /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1518,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxTIDsPerBTreePage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1526,7 +1665,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxTIDsPerBTreePage; offnum = Min(offnum, maxoff); @@ -1567,9 +1706,41 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) &continuescan); if (passes_quals && tuple_alive) { - /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. 
+ */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } } if (!continuescan) { @@ -1583,8 +1754,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1597,6 +1768,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1609,6 +1782,71 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save TIDs/items from a single posting list tuple. + * + * Saves an index item into so->currPos.items[itemIndex] for TID that is + * returned to scan first. Second or subsequent TIDs for posting list should + * be saved by calling _bt_savepostingitem(). + * + * Returns an offset into tuple storage space that main tuple is stored at if + * needed. + */ +static int +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + Assert(BTreeTupleIsPosting(itup)); + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + + return currItem->tupleOffset; + } + + return 0; +} + +/* + * Save an index item into so->currPos.items[itemIndex] for current posting + * tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. Caller passes its return value as tupleOffset. 
+ */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, int tupleOffset) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every TID + * that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = tupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index baec5de999..e66cd36dfa 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -243,6 +243,7 @@ typedef struct BTPageState BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ uint32 btps_level; /* tree level (0 = leaf) */ Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ @@ -277,7 +278,10 @@ static void _bt_slideleft(Page page); static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, - IndexTuple itup); + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -563,6 +567,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); + /* _bt_mkscankey() won't set allequalimage without metapage */ + wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); /* * We need to log index creation in WAL iff WAL archiving/streaming is @@ -711,6 +717,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) state->btps_lowkey = NULL; /* initialize lastoff so first item goes into P_FIRSTKEY */ state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; state->btps_level = level; /* set "full" threshold based on level. See notes at head of file. */ if (level > 0) @@ -789,7 +796,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -821,14 +829,27 @@ _bt_sortaddtup(Page page, * the truncated high key at offset 1. * * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. 
Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. *---------- */ static void -_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) { Page npage; BlockNumber nblkno; OffsetNumber last_off; + Size last_truncextra; Size pgspc; Size itupsz; bool isleaf; @@ -842,6 +863,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleSize(itup); @@ -883,10 +906,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * page. Disregard fillfactor and insert on "full" current page if we * don't have the minimum number of items yet. (Note that we deliberately * assume that suffix truncation neither enlarges nor shrinks new high key - * when applying soft limit.) + * when applying soft limit, except when last tuple has a posting list.) */ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || - (pgspc < state->btps_full && last_off > P_FIRSTKEY)) + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. @@ -944,11 +967,14 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * We don't try to bias our choice of split point to make it more * likely that _bt_truncate() can truncate away more attributes, * whereas the split point used within _bt_split() is chosen much - * more delicately. Suffix truncation is mostly useful because it - * improves space utilization for workloads with random - * insertions. It doesn't seem worthwhile to add logic for - * choosing a split point here for a benefit that is bound to be - * much smaller. + * more delicately. Even still, the lastleft and firstright + * tuples passed to _bt_truncate() here are at least not fully + * equal to each other when deduplication is used, unless there is + * a large group of duplicates (also, unique index builds usually + * have few or no spool2 duplicates). When the split point is + * between two unequal tuples, _bt_truncate() will avoid including + * a heap TID in the new high key, which is the most important + * benefit of suffix truncation. * * Overwrite the old item with new truncated high key directly. * oitup is already located at the physical beginning of tuple @@ -983,7 +1009,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); BTreeTupleSetDownLink(state->btps_lowkey, oblkno); - _bt_buildadd(wstate, state->btps_next, state->btps_lowkey); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); pfree(state->btps_lowkey); /* @@ -1045,6 +1071,43 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. 
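A minimal standalone sketch of the accumulate-then-flush pattern that the deduplication pass follows may help here. Plain integers stand in for IndexTuple keys and ItemPointerData TIDs, and the helper names are illustrative assumptions rather than server functions:

/* Sketch only: dedup a sorted (key, tid) stream into (key, TID array) groups */
#include <stdio.h>

#define MAXTIDS 1024

static int pending_key;              /* key of current pending group */
static int pending_tids[MAXTIDS];    /* TIDs saved for that key so far */
static int npending = 0;

/* "Write out" the pending group: one key plus its sorted TID array */
static void
finish_pending(void)
{
    printf("key %d ->", pending_key);
    for (int i = 0; i < npending; i++)
        printf(" %d", pending_tids[i]);
    printf("\n");
    npending = 0;
}

int
main(void)
{
    /* sorted input, as the tuplesort would hand it over */
    int keys[] = {1, 1, 1, 2, 3, 3};
    int tids[] = {10, 11, 15, 20, 30, 31};

    for (int i = 0; i < 6; i++)
    {
        if (npending > 0 && keys[i] == pending_key && npending < MAXTIDS)
        {
            /* equal to the pending group's key: just remember the TID */
            pending_tids[npending++] = tids[i];
            continue;
        }

        /* key changed (or group is full): flush, then start a new group */
        if (npending > 0)
            finish_pending();
        pending_key = keys[i];
        pending_tids[0] = tids[i];
        npending = 1;
    }

    /* there is always a final pending group when any input was seen */
    if (npending > 0)
        finish_pending();

    return 0;
}

The real code differs in that a single-TID group is written out as a plain tuple rather than a posting list, and the group is capped by maxpostingsize in bytes rather than by an element count.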
+ * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd(). + */ +static void +_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, + BTDedupState dstate) +{ + Assert(dstate->nitems > 0); + + if (dstate->nitems == 1) + _bt_buildadd(wstate, state, dstate->base, 0); + else + { + IndexTuple postingtuple; + Size truncextra; + + /* form a tuple with a posting list */ + postingtuple = _bt_form_posting(dstate->base, + dstate->htids, + dstate->nhtids); + /* Calculate posting list overhead */ + truncextra = IndexTupleSize(postingtuple) - + BTreeTupleGetPostingOffset(postingtuple); + + _bt_buildadd(wstate, state, postingtuple, truncextra); + pfree(postingtuple); + } + + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; +} + /* * Finish writing out the completed btree. */ @@ -1090,7 +1153,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST(opaque)); BTreeTupleSetDownLink(s->btps_lowkey, blkno); - _bt_buildadd(wstate, s->btps_next, s->btps_lowkey); + _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); pfree(s->btps_lowkey); s->btps_lowkey = NULL; } @@ -1111,7 +1174,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * by filling in a valid magic number in the metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, rootblkno, rootlevel); + _bt_initmetapage(metapage, rootblkno, rootlevel, + wstate->inskey->allequalimage); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } @@ -1132,6 +1196,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->allequalimage && + BTGetDeduplicateItems(wstate->index); if (merge) { @@ -1228,12 +1296,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (load1) { - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); itup = tuplesort_getindextuple(btspool->sortstate, true); } else { - _bt_buildadd(wstate, state, itup2); + _bt_buildadd(wstate, state, itup2, 0); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } @@ -1243,9 +1311,100 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } pfree(sortKeys); } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState dstate; + + dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate->deduplicate = true; /* unused */ + dstate->maxpostingsize = 0; /* set later */ + /* Metadata about base tuple of current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; /* unused */ + dstate->nintervals = 0; /* unused */ + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + + /* + * Limit size of posting list tuples to 1/10 space we want to + * leave behind on the page, plus space for final item's line + * pointer. 
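To make the limit concrete, assuming the stock 8192-byte BLCKSZ, 8-byte maximum alignment, and 4-byte line pointers (ItemIdData):

    maxpostingsize = MAXALIGN_DOWN(8192 * 10 / 100) - sizeof(ItemIdData)
                   = MAXALIGN_DOWN(819) - 4
                   = 816 - 4
                   = 812 bytes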
This is equal to the space that we'd like to + * leave behind on each leaf page when fillfactor is 90, + * allowing us to get close to fillfactor% space utilization + * when there happen to be a great many duplicates. (This + * makes higher leaf fillfactor settings ineffective when + * building indexes that have many duplicates, but packing + * leaf pages full with few very large tuples doesn't seem + * like a useful goal.) + */ + dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - + sizeof(ItemIdData); + Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + dstate->maxpostingsize <= INDEX_SIZE_MASK); + dstate->htids = palloc(dstate->maxpostingsize); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID from itup has been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list. + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + if (state) + { + /* + * Handle the last item (there must be a last item when the + * tuplesort returned one or more tuples) + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { @@ -1253,7 +1412,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (state == NULL) state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); /* Report progress */ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 76c2d945c8..8ba055be9e 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel, state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,6 +462,7 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsz = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ @@ -468,8 +472,30 @@ _bt_recsplitloc(FindSplitData *state, if (newitemisfirstonright) firstrightitemsz = state->newitemsz; else + { firstrightitemsz = firstoldonrightsz; + /* + * Calculate suffix truncation space saving when firstright is a + * posting list tuple, though only when the firstright is over 64 + * bytes including line pointer overhead (arbitrary). 
This avoids + * accessing the tuple in cases where its posting list must be very + * small (if firstright has one at all). + */ + if (state->is_leaf && firstrightitemsz > 64) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -491,11 +517,17 @@ _bt_recsplitloc(FindSplitData *state, * If we are on the leaf level, assume that suffix truncation cannot avoid * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and - * will rarely be larger, but conservatively assume the worst case. + * will rarely be larger, but conservatively assume the worst case. We do + * go to the trouble of subtracting away posting list overhead, though + * only when it looks like it will make an appreciable difference. + * (Posting lists are the only case where truncation will typically make + * the final high key far smaller than firstright, so being a bit more + * precise there noticeably improves the balance of free space.) */ if (state->is_leaf) leftfree -= (int16) (firstrightitemsz + - MAXALIGN(sizeof(ItemPointerData))); + MAXALIGN(sizeof(ItemPointerData)) - + postingsz); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +723,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index af07732eab..54afa6f417 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -81,7 +81,10 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft, * determine whether or not the keys in the index are expected to be * unique (i.e. if this is a "heapkeyspace" index). We assume a * heapkeyspace index when caller passes a NULL tuple, allowing index - * build callers to avoid accessing the non-existent metapage. + * build callers to avoid accessing the non-existent metapage. We + * also assume that the index is _not_ allequalimage when a NULL tuple + * is passed; CREATE INDEX callers call _bt_allequalimage() to set the + * field themselves. 
*/ BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup) @@ -108,7 +111,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup) */ key = palloc(offsetof(BTScanInsertData, scankeys) + sizeof(ScanKeyData) * indnkeyatts); - key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); + else + { + /* Utility statement callers can set these fields themselves */ + key->heapkeyspace = true; + key->allequalimage = false; + } key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; @@ -1374,6 +1384,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1535,6 +1546,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1774,10 +1786,65 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + /* + * Note that we rely on the assumption that heap TIDs in the + * scanpos items array are always in ascending heap TID order + * within a posting list + */ + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* kitem must have matching offnum when heap TIDs match */ + Assert(kitem->indexOffset == offnum); + + /* + * Read-ahead to later kitems here. + * + * We rely on the assumption that not advancing kitem here + * will prevent us from considering the posting list tuple + * fully dead by not matching its next heap TID in next + * loop iteration. + * + * If, on the other hand, this is the final heap TID in + * the posting list tuple, then tuple gets killed + * regardless (i.e. we handle the case where the last + * kitem is also the last heap TID in the last index tuple + * correctly -- posting tuple still gets killed). + */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + /* + * Don't bother advancing the outermost loop's int iterator to + * avoid processing killed items that relate to the same + * offnum/posting list tuple. This micro-optimization hardly + * seems worth it. (Further iterations of the outermost loop + * will fail to match on this same posting list's first heap + * TID instead, so we'll advance to the next offnum/index + * tuple pretty quickly.) 
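The invariant applied above can also be shown as a simplified standalone check: a posting list tuple is only a candidate for LP_DEAD when every TID it carries was killed by the scan. Plain ints stand in for heap TIDs, both arrays are in ascending order, and this is a sketch of the rule rather than the exact read-ahead control flow used here:

#include <stdbool.h>
#include <stdio.h>

/* true only if every posting[] entry appears in killed[] (both ascending) */
static bool
posting_all_dead(const int *posting, int nposting,
                 const int *killed, int nkilled)
{
    int k = 0;

    for (int j = 0; j < nposting; j++)
    {
        /* advance through killed[] looking for posting[j] */
        while (k < nkilled && killed[k] < posting[j])
            k++;
        if (k == nkilled || killed[k] != posting[j])
            return false;       /* at least one TID is still live */
    }

    return true;                /* whole tuple may be marked LP_DEAD */
}

int
main(void)
{
    int posting[] = {10, 11, 15};
    int killed_all[] = {10, 11, 15, 20};
    int killed_some[] = {10, 15};

    printf("%d %d\n",
           posting_all_dead(posting, 3, killed_all, 4),    /* prints 1 */
           posting_all_dead(posting, 3, killed_some, 2));  /* prints 0 */
    return 0;
}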
+ */ + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2018,7 +2085,9 @@ btoptions(Datum reloptions, bool validate) static const relopt_parse_elt tab[] = { {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(BTOptions, vacuum_cleanup_index_scale_factor)} + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplicate_items", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplicate_items)} }; @@ -2119,11 +2188,10 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Size newsize; /* - * We should only ever truncate leaf index tuples. It's never okay to - * truncate a second time. + * We should only ever truncate non-pivot tuples from leaf pages. It's + * never okay to truncate when splitting an internal page. */ - Assert(BTreeTupleGetNAtts(lastleft, rel) == natts); - Assert(BTreeTupleGetNAtts(firstright, rel) == natts); + Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); @@ -2139,6 +2207,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(pivot)) + { + /* + * index_truncate_tuple() just returns a straight copy of + * firstright when it has no key attributes to truncate. We need + * to truncate away the posting list ourselves. + */ + Assert(keepnatts == nkeyatts); + Assert(natts == nkeyatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2155,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. */ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2172,6 +2255,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); pivot = palloc0(newsize); memcpy(pivot, firstright, IndexTupleSize(firstright)); + + if (BTreeTupleIsPosting(firstright)) + { + /* + * New pivot tuple was copied from firstright, which happens to be + * a posting list tuple. We will have to include the max lastleft + * heap TID in the final pivot tuple, but we can remove the + * posting list now. (Pivot tuples should never contain a posting + * list.) 
+ */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + } } /* @@ -2199,7 +2295,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2210,9 +2306,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2225,7 +2324,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. Note that @@ -2234,7 +2333,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2301,6 +2401,13 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, keepnatts++; } + /* + * Assert that _bt_keep_natts_fast() agrees with us in passing. This is + * expected in an allequalimage index. + */ + Assert(!itup_key->allequalimage || + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + return keepnatts; } @@ -2315,13 +2422,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal after detoasting. + * unless they're bitwise equal after detoasting. When an index only has + * "equal image" columns, routine is guaranteed to give the same result as + * _bt_keep_natts would. * - * These issues must be acceptable to callers, typically because they're only - * concerned about making suffix truncation as effective as possible without - * leaving excessive amounts of free space on either side of page split. * Callers can rely on the fact that attributes considered equal here are - * definitely also equal according to _bt_keep_natts. + * definitely also equal according to _bt_keep_natts, even when the index uses + * an opclass or collation that is not "allequalimage"/deduplication-safe. + * This weaker guarantee is good enough for nbtsplitloc.c caller, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. 
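The function body is elided by this hunk, so a rough standalone sketch of the idea may be useful: walk the key attributes in order, stop at the first one whose values are not bitwise equal, and report how many attributes a pivot would have to keep. Attribute values are modelled here as byte strings (the datum_image_eq() notion of equality), and the names are illustrative assumptions only:

#include <stdio.h>
#include <string.h>

/* number of leading equal attributes, plus one (the "keep natts" result) */
static int
keep_natts_fast_sketch(const char *lastleft[], const char *firstright[],
                       int nkeyatts)
{
    int keepnatts = 1;

    for (int attnum = 0; attnum < nkeyatts; attnum++)
    {
        if (strcmp(lastleft[attnum], firstright[attnum]) != 0)
            break;              /* first attribute that tells them apart */
        keepnatts++;
    }

    return keepnatts;
}

int
main(void)
{
    const char *left[] = {"usa", "texas", "austin"};
    const char *right[] = {"usa", "texas", "houston"};

    /* attributes 1 and 2 match, so a pivot must keep 3 attributes */
    printf("%d\n", keep_natts_fast_sketch(left, right, 3));
    return 0;
}

A result greater than nkeyatts means every key attribute matched, which is the case where the heap TID tiebreaker has to be retained.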
*/ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2393,28 +2503,42 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * Mask allocated for number of keys in index tuple must be able to fit * maximum possible number of index attributes */ - StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS, - "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS"); + StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS, + "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* Posting list tuples should never have "pivot heap TID" bit set */ + if (BTreeTupleIsPosting(itup) && + (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2458,12 +2582,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. */ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && - ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; } else { @@ -2489,7 +2613,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. 
*/ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) return false; /* @@ -2559,8 +2687,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, BTMaxItemSizeNoHeapTid(page), RelationGetRelationName(rel)), errdetail("Index row references tuple (%u,%u) in relation \"%s\".", - ItemPointerGetBlockNumber(&newtup->t_tid), - ItemPointerGetOffsetNumber(&newtup->t_tid), + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), RelationGetRelationName(heap)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 2e5202c2d6..99d0914e72 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -22,6 +22,9 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ /* * _bt_restore_page -- re-enter all the index tuples on a page @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_allequalimage = xlrec->allequalimage; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) } static void -btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); @@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (!posting) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add new item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. 
+ */ + postingoff = *((uint16 *) datapos); + datapos += sizeof(uint16); + datalen -= sizeof(uint16); + + itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + Assert(isleaf && postingoff > 0); + newitem = CopyIndexTuple((IndexTuple) datapos); + nposting = _bt_swap_posting(newitem, oposting, postingoff); + + /* Replace existing posting list with post-split version */ + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + + /* Insert "final" new item (not orignewitem from WAL stream) */ + Assert(IndexTupleSize(newitem) == datalen); + if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "failed to add posting split new item"); + } PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record) BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; IndexTuple newitem = NULL, - left_hikey = NULL; + left_hikey = NULL, + nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; Page newlpage; - OffsetNumber leftoff; + OffsetNumber leftoff, + replacepostingoff = InvalidOffsetNumber; datapos = XLogRecGetBlockData(record, 0, &datalen); - if (onleft) + if (onleft || xlrec->postingoff != 0) { newitem = (IndexTuple) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; + + if (xlrec->postingoff != 0) + { + ItemId itemid; + IndexTuple oposting; + + /* Posting list must be at offset number before new item's */ + replacepostingoff = OffsetNumberPrev(xlrec->newitemoff); + + /* Use mutable, aligned newitem copy in _bt_swap_posting() */ + newitem = CopyIndexTuple(newitem); + itemid = PageGetItemId(lpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(lpage, itemid); + nposting = _bt_swap_posting(newitem, oposting, + xlrec->postingoff); + } } /* @@ -308,8 +374,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record) Size itemsz; IndexTuple item; + /* Add replacement posting list when required */ + if (off == replacepostingoff) + { + Assert(onleft || xlrec->firstright == xlrec->newitemoff); + if (PageAddItem(newlpage, (Item) nposting, + MAXALIGN(IndexTupleSize(nposting)), leftoff, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new posting list item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; /* don't insert oposting */ + } + /* add the new item if it was inserted on left page */ - if (onleft && off == xlrec->newitemoff) + else if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) @@ -383,6 +461,98 @@ btree_xlog_split(bool onleft, XLogReaderState *record) } } +static void +btree_xlog_dedup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record); + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) + { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + Page page = (Page) BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + OffsetNumber offnum, + minoff, + maxoff; + BTDedupState state; + BTDedupInterval *intervals; + Page newpage; + + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; /* unused */ + /* Conservatively use larger maxpostingsize 
than primary */ + state->maxpostingsize = BTMaxItemSize(page); + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + state->phystupsize = 0; + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + newpage = PageGetTempPageCopySpecial(page); + + if (!P_RIGHTMOST(opaque)) + { + ItemId itemid = PageGetItemId(page, P_HIKEY); + Size itemsz = ItemIdGetLength(itemid); + IndexTuple item = (IndexTuple) PageGetItem(page, itemid); + + if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + intervals = (BTDedupInterval *) ptr; + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + if (offnum == minoff) + _bt_dedup_start_pending(state, itup, offnum); + else if (state->nintervals < xlrec->nintervals && + state->baseoff == intervals[state->nintervals].baseoff && + state->nitems < intervals[state->nintervals].nitems) + { + if (!_bt_dedup_save_htid(state, itup)) + elog(ERROR, "deduplication failed to add heap tid to pending posting list"); + } + else + { + _bt_dedup_finish_pending(newpage, state); + _bt_dedup_start_pending(state, itup, offnum); + } + } + + _bt_dedup_finish_pending(newpage, state); + Assert(state->nintervals == xlrec->nintervals); + Assert(memcmp(state->intervals, intervals, + state->nintervals * sizeof(BTDedupInterval)) == 0); + + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + PageRestoreTempPage(newpage, page); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); +} + static void btree_xlog_vacuum(XLogReaderState *record) { @@ -405,7 +575,56 @@ btree_xlog_vacuum(XLogReaderState *record) page = (Page) BufferGetPage(buffer); - PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); + if (xlrec->nupdated > 0) + { + OffsetNumber *updatedoffsets; + xl_btree_update *updates; + + updatedoffsets = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + updates = (xl_btree_update *) ((char *) updatedoffsets + + xlrec->nupdated * + sizeof(OffsetNumber)); + + for (int i = 0; i < xlrec->nupdated; i++) + { + BTVacuumPosting vacposting; + IndexTuple origtuple; + ItemId itemid; + Size itemsz; + + itemid = PageGetItemId(page, updatedoffsets[i]); + origtuple = (IndexTuple) PageGetItem(page, itemid); + + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + updates->ndeletedtids * sizeof(uint16)); + vacposting->updatedoffset = updatedoffsets[i]; + vacposting->itup = origtuple; + vacposting->ndeletedtids = updates->ndeletedtids; + memcpy(vacposting->deletetids, + (char *) updates + SizeOfBtreeUpdate, + updates->ndeletedtids * sizeof(uint16)); + + _bt_update_posting(vacposting); + + /* Overwrite updated version of tuple */ + itemsz = MAXALIGN(IndexTupleSize(vacposting->itup)); + if (!PageIndexTupleOverwrite(page, updatedoffsets[i], + (Item) vacposting->itup, itemsz)) + elog(PANIC, "failed to update partially dead item"); + + pfree(vacposting->itup); + pfree(vacposting); + + /* advance to next xl_btree_update from array */ + updates = (xl_btree_update *) + ((char *) updates + SizeOfBtreeUpdate + + 
updates->ndeletedtids * sizeof(uint16)); + } + } + + if (xlrec->ndeleted > 0) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* * Mark the page as not containing any LP_DEAD items --- see comments @@ -724,17 +943,19 @@ void btree_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + MemoryContext oldCtx; + oldCtx = MemoryContextSwitchTo(opCtx); switch (info) { case XLOG_BTREE_INSERT_LEAF: - btree_xlog_insert(true, false, record); + btree_xlog_insert(true, false, false, record); break; case XLOG_BTREE_INSERT_UPPER: - btree_xlog_insert(false, false, record); + btree_xlog_insert(false, false, false, record); break; case XLOG_BTREE_INSERT_META: - btree_xlog_insert(false, true, record); + btree_xlog_insert(false, true, false, record); break; case XLOG_BTREE_SPLIT_L: btree_xlog_split(true, record); @@ -742,6 +963,12 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); + break; + case XLOG_BTREE_DEDUP: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -767,6 +994,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 7d63a7124e..7a1616f371 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) case XLOG_BTREE_INSERT_LEAF: case XLOG_BTREE_INSERT_UPPER: case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_INSERT_POST: { xl_btree_insert *xlrec = (xl_btree_insert *) rec; @@ -38,15 +39,24 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, xlrec->firstright, + xlrec->newitemoff, xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "nintervals %u", xlrec->nintervals); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted); + appendStringInfo(buf, "ndeleted %u; nupdated %u", + xlrec->ndeleted, xlrec->nupdated); break; } case XLOG_BTREE_DELETE: @@ -130,6 +140,12 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_INSERT_POST: + id = "INSERT_POST"; + break; + case XLOG_BTREE_DEDUP: + id = "DEDUP"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 4ea6ea7a3d..cb7b8c8a63 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1048,8 +1048,10 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) * This is better than deleting and reinserting the tuple, because it * avoids any data shifting when the tuple 
size doesn't change; and * even when it does, we avoid moving the line pointers around. - * Conceivably this could also be of use to an index AM that cares about - * the physical order of tuples as well as their ItemId order. + * This could be used by an index AM that doesn't want to unset the + * LP_DEAD bit when it happens to be set. It could conceivably also be + * used by an index AM that cares about the physical order of tuples as + * well as their logical/ItemId order. * * If there's insufficient space for the new tuple, return false. Other * errors represent data-corruption problems, so we just elog. @@ -1134,8 +1136,9 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, } } - /* Update the item's tuple length (other fields shouldn't change) */ - ItemIdSetNormal(tupid, offset + size_diff, newsize); + /* Update the item's tuple length without changing its lp_flags field */ + tupid->lp_off = offset + size_diff; + tupid->lp_len = newsize; /* Copy new tuple data onto page */ memcpy(PageGetItem(page, tupid), newtup, newsize); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index dc03fbde13..b6b08d0ccb 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1731,14 +1731,14 @@ psql_completion(const char *text, int start, int end) /* ALTER INDEX SET|RESET ( */ else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "(")) COMPLETE_WITH("fillfactor", - "vacuum_cleanup_index_scale_factor", /* BTREE */ + "vacuum_cleanup_index_scale_factor", "deduplicate_items", /* BTREE */ "fastupdate", "gin_pending_list_limit", /* GIN */ "buffering", /* GiST */ "pages_per_range", "autosummarize" /* BRIN */ ); else if (Matches("ALTER", "INDEX", MatchAny, "SET", "(")) COMPLETE_WITH("fillfactor =", - "vacuum_cleanup_index_scale_factor =", /* BTREE */ + "vacuum_cleanup_index_scale_factor =", "deduplicate_items =", /* BTREE */ "fastupdate =", "gin_pending_list_limit =", /* GIN */ "buffering =", /* GiST */ "pages_per_range =", "autosummarize =" /* BRIN */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e8d4d2b55b..bfe49f46b0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -108,6 +108,7 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ + bool btm_allequalimage; /* are all columns "equalimage"? */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -124,6 +125,14 @@ typedef struct BTMetaPageData * need to be immediately re-indexed at pg_upgrade. In order to get the * new heapkeyspace semantics, however, a REINDEX is needed. * + * Deduplication is safe to use when the btm_allequalimage field is set to + * true. It's safe to read the btm_allequalimage field on version 3, but + * only version 4 indexes make use of deduplication. Even version 4 + * indexes created on PostgreSQL v12 will need a REINDEX to make use of + * deduplication, though, since there is no other way to set + * btm_allequalimage to true (pg_upgrade hasn't been taught to set the + * metapage field). + * * Btree version 2 is mostly the same as version 3. There are two new * fields in the metapage that were introduced in version 3. 
A version 2 * metapage will be automatically upgraded to version 3 on the first @@ -156,6 +165,21 @@ typedef struct BTMetaPageData MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +/* + * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples + * that may be stored on a btree leaf page. It is used to size the + * per-page temporary buffers used by index scans.) + * + * Note: we don't bother considering per-tuple overheads here to keep + * things simple (value is based on how many elements a single array of + * heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. + */ +#define MaxTIDsPerBTreePage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) + /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. * For pages above the leaf level, we use a fixed 70% fillfactor. @@ -230,16 +254,15 @@ typedef struct BTMetaPageData * tuples (non-pivot tuples). _bt_check_natts() enforces the rules * described here. * - * Non-pivot tuple format: + * Non-pivot tuple format (plain/non-posting variant): * * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * - * All other types of index tuples ("pivot" tuples) only have key columns, - * since pivot tuples only exist to represent how the key space is + * Non-pivot tuples complement pivot tuples, which only have key columns. + * The sole purpose of pivot tuples is to represent how the key space is * separated. In general, any B-Tree index that has more than one level * (i.e. any index that does not just consist of a metapage and a single * leaf root page) must have some number of pivot tuples, since pivot @@ -264,7 +287,8 @@ typedef struct BTMetaPageData * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap * TID column sometimes stored in pivot tuples -- that's represented by * the presence of BT_PIVOT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in - * t_info is always set on BTREE_VERSION 4 pivot tuples. + * t_info is always set on BTREE_VERSION 4 pivot tuples, since + * BTreeTupleIsPivot() must work reliably on heapkeyspace versions. * * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in * pivot tuples. In that case, the number of key columns is implicitly @@ -279,90 +303,256 @@ typedef struct BTMetaPageData * The 12 least significant offset bits from t_tid are used to represent * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for - * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any - * number of columns/attributes <= INDEX_MAX_KEYS. + * future use. BT_OFFSET_MASK should be large enough to store any number + * of columns/attributes <= INDEX_MAX_KEYS. + * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. PostgreSQL v13 introduced a + * new non-pivot tuple format to support deduplication: posting list + * tuples. Deduplication merges together multiple equal non-pivot tuples + * into a logically equivalent, space efficient representation. 
A posting + * list is an array of ItemPointerData elements. Non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid. These flags redefine the content of the posting + * tuple's t_tid to store an offset to the posting list, as well as the + * total number of posting list array elements. + * + * The 12 least significant offset bits from t_tid are used to represent + * the number of posting items present in the tuple, leaving 4 status + * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for + * future use. Like any non-pivot tuple, the number of columns stored is + * always implicitly the total number in the index (in practice there can + * never be non-key columns stored, since deduplication is not supported + * with INCLUDE indexes). BT_OFFSET_MASK should be large enough to store + * any number of posting list TIDs that might be present in a tuple (since + * tuple size is subject to the INDEX_SIZE_MASK limit). * * Note well: The macros that deal with the number of attributes in tuples - * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple, - * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot - * tuple (or must have the same number of attributes as the index has - * generally in the case of !heapkeyspace indexes). They will need to be - * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK - * for something else. + * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or + * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set + * must be a non-pivot tuple (or must have the same number of attributes as + * the index has generally in the case of !heapkeyspace indexes). */ #define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* Item pointer offset bits */ #define BT_RESERVED_OFFSET_MASK 0xF000 -#define BT_N_KEYS_OFFSET_MASK 0x0FFF +#define BT_OFFSET_MASK 0x0FFF #define BT_PIVOT_HEAP_TID_ATTR 0x1000 - -/* Get/set downlink block number in pivot tuple */ -#define BTreeTupleGetDownLink(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetDownLink(itup, blkno) \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)) +#define BT_IS_POSTING 0x2000 /* - * Get/set leaf page highkey's link. During the second phase of deletion, the - * target leaf page's high key may point to an ancestor page (at all other - * times, the leaf level high key's link is not used). See the nbtree README - * for full details. 
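To make the t_tid offset-number encoding described above concrete, here is a small self-contained program that packs and unpacks a posting list's TID count the same way BTreeTupleSetPosting() and BTreeTupleIsPosting()/BTreeTupleGetNPosting() do. Only the mask values are taken from the patch; the IndexTuple plumbing around them is deliberately omitted.

/*
 * Minimal standalone illustration of how a posting list tuple's t_tid
 * offset number packs a TID count together with status bits.  The mask
 * values mirror the nbtree.h definitions; everything else is simplified.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BT_RESERVED_OFFSET_MASK	0xF000
#define BT_OFFSET_MASK			0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR	0x1000
#define BT_IS_POSTING			0x2000

int
main(void)
{
	uint16_t	nhtids = 270;	/* number of heap TIDs in the posting list */
	uint16_t	offnum;

	assert((nhtids & BT_OFFSET_MASK) == nhtids);

	/* Encode, as BTreeTupleSetPosting() does for the offset number */
	offnum = nhtids | BT_IS_POSTING;

	/* Decode, as BTreeTupleIsPosting()/BTreeTupleGetNPosting() do */
	printf("is posting: %d\n", (offnum & BT_IS_POSTING) != 0);
	printf("nposting:   %u\n", offnum & BT_OFFSET_MASK);
	return 0;
}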
+ * Note: BTreeTupleIsPivot() can have false negatives (but not false + * positives) when used with !heapkeyspace indexes */ -#define BTreeTupleGetTopParent(itup) \ - ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) -#define BTreeTupleSetTopParent(itup, blkno) \ - do { \ - ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \ - BTreeTupleSetNAtts((itup), 0); \ - } while(0) +static inline bool +BTreeTupleIsPivot(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* absence of BT_IS_POSTING in offset number indicates pivot tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0) + return false; + + return true; +} + +static inline bool +BTreeTupleIsPosting(IndexTuple itup) +{ + if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + return false; + /* presence of BT_IS_POSTING in offset number indicates posting tuple */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0) + return false; + + return true; +} + +static inline void +BTreeTupleSetPosting(IndexTuple itup, int nhtids, int postingoffset) +{ + Assert(nhtids > 1 && (nhtids & BT_OFFSET_MASK) == nhtids); + Assert(postingoffset == MAXALIGN(postingoffset)); + Assert(postingoffset < INDEX_SIZE_MASK); + + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING)); + ItemPointerSetBlockNumber(&itup->t_tid, postingoffset); +} + +static inline uint16 +BTreeTupleGetNPosting(IndexTuple posting) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPosting(posting)); + + existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid); + return (existing & BT_OFFSET_MASK); +} + +static inline uint32 +BTreeTupleGetPostingOffset(IndexTuple posting) +{ + Assert(BTreeTupleIsPosting(posting)); + + return ItemPointerGetBlockNumberNoCheck(&posting->t_tid); +} + +static inline ItemPointer +BTreeTupleGetPosting(IndexTuple posting) +{ + return (ItemPointer) ((char *) posting + + BTreeTupleGetPostingOffset(posting)); +} + +static inline ItemPointer +BTreeTupleGetPostingN(IndexTuple posting, int n) +{ + return BTreeTupleGetPosting(posting) + n; +} /* - * Get/set number of attributes within B-tree index tuple. + * Get/set downlink block number in pivot tuple. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetDownLink(IndexTuple pivot) +{ + return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid); +} + +static inline void +BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&pivot->t_tid, blkno); +} + +/* + * Get number of attributes within tuple. * * Note that this does not include an implicit tiebreaker heap TID * attribute, if any. Note also that the number of key attributes must be * explicitly represented in all heapkeyspace pivot tuples. + * + * Note: This is defined as a macro rather than an inline function to + * avoid including rel.h. */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (BTreeTupleIsPivot(itup)) ? 
\ ( \ - ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Set number of attributes in tuple, making it into a pivot tuple */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_PIVOT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int natts) +{ + Assert(natts <= INDEX_MAX_KEYS); + + itup->t_info |= INDEX_ALT_TID_MASK; + /* BT_IS_POSTING bit may be unset -- tuple always becomes a pivot tuple */ + ItemPointerSetOffsetNumber(&itup->t_tid, natts); + Assert(BTreeTupleIsPivot(itup)); +} + /* - * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * Set the bit indicating heap TID attribute present in pivot tuple */ -#define BTreeTupleSetAltHeapTID(itup) \ - do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, \ - ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_PIVOT_HEAP_TID_ATTR); \ - } while(0) +static inline void +BTreeTupleSetAltHeapTID(IndexTuple pivot) +{ + OffsetNumber existing; + + Assert(BTreeTupleIsPivot(pivot)); + + existing = ItemPointerGetOffsetNumberNoCheck(&pivot->t_tid); + ItemPointerSetOffsetNumber(&pivot->t_tid, + existing | BT_PIVOT_HEAP_TID_ATTR); +} + +/* + * Get/set leaf page's "top parent" link from its high key. Used during page + * deletion. + * + * Note: Cannot assert that tuple is a pivot tuple. If we did so then + * !heapkeyspace indexes would exhibit false positive assertion failures. + */ +static inline BlockNumber +BTreeTupleGetTopParent(IndexTuple leafhikey) +{ + return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid); +} + +static inline void +BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno) +{ + ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno); + BTreeTupleSetNAtts(leafhikey, 0); +} + +/* + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. + */ +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_PIVOT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &itup->t_tid; +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. + * + * Works with non-pivot tuples only. 
+ */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + { + uint16 nposting = BTreeTupleGetNPosting(itup); + + return BTreeTupleGetPostingN(itup, nposting - 1); + } + + return &itup->t_tid; +} /* * Operator strategy numbers for B-tree have been moved to access/stratnum.h, @@ -439,6 +629,9 @@ typedef BTStackData *BTStack; * indexes whose version is >= version 4. It's convenient to keep this close * by, rather than accessing the metapage repeatedly. * + * allequalimage is set to indicate that deduplication is safe for the index. + * This is also a property of the index relation rather than an indexscan. + * * anynullkeys indicates if any of the keys had NULL value when scankey was * built from index tuple (note that already-truncated tuple key attributes * set NULL as a placeholder key value, which also affects value of @@ -474,6 +667,7 @@ typedef BTStackData *BTStack; typedef struct BTScanInsertData { bool heapkeyspace; + bool allequalimage; bool anynullkeys; bool nextkey; bool pivotsearch; @@ -512,10 +706,94 @@ typedef struct BTInsertStateData bool bounds_valid; OffsetNumber low; OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. -1 sentinel value indicates overlap + * with an existing posting list tuple that has its LP_DEAD bit set. + */ + int postingoff; } BTInsertStateData; typedef BTInsertStateData *BTInsertState; +/* + * State used to representing an individual pending tuple during + * deduplication. + */ +typedef struct BTDedupInterval +{ + OffsetNumber baseoff; + uint16 nitems; +} BTDedupInterval; + +/* + * BTDedupStateData is a working area used during deduplication. + * + * The status info fields track the state of a whole-page deduplication pass. + * State about the current pending posting list is also tracked. + * + * A pending posting list is comprised of a contiguous group of equal items + * from the page, starting from page offset number 'baseoff'. This is the + * offset number of the "base" tuple for new posting list. 'nitems' is the + * current total number of existing items from the page that will be merged to + * make a new posting list tuple, including the base tuple item. (Existing + * items may themselves be posting list tuples, or regular non-pivot tuples.) + * + * The total size of the existing tuples to be freed when pending posting list + * is processed gets tracked by 'phystupsize'. This information allows + * deduplication to calculate the space saving for each new posting list + * tuple, and for the entire pass over the page as a whole. + */ +typedef struct BTDedupStateData +{ + /* Deduplication status info for entire pass over page */ + bool deduplicate; /* Still deduplicating page? */ + Size maxpostingsize; /* Limit on size of final tuple */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without original posting list */ + + /* Other metadata about pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* Number of heap TIDs in htids array */ + int nitems; /* Number of existing tuples/line pointers */ + Size phystupsize; /* Includes line pointer overhead */ + + /* + * Array of tuples to go on new version of the page. Contains one entry + * for each group of consecutive items. 
Note that existing tuples that + * will not become posting list tuples do not appear in the array (they + * are implicitly unchanged by deduplication pass). + */ + int nintervals; /* current size of intervals array */ + BTDedupInterval intervals[MaxIndexTuplesPerPage]; +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + +/* + * BTVacuumPostingData is state that represents how to VACUUM a posting list + * tuple when some (though not all) of its TIDs are to be deleted. + * + * Convention is that itup field is the original posting list tuple on input, + * and palloc()'d final tuple used to overwrite existing tuple on output. + */ +typedef struct BTVacuumPostingData +{ + /* Tuple that will be/was updated */ + IndexTuple itup; + OffsetNumber updatedoffset; + + /* State needed to describe final itup in WAL */ + uint16 ndeletedtids; + uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]; +} BTVacuumPostingData; + +typedef BTVacuumPostingData *BTVacuumPosting; + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -539,7 +817,9 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each TID in the posting list + * tuple. */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -583,7 +863,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -691,6 +971,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ /* fraction of newly inserted tuples prior to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; + bool deduplicate_items; /* Try to deduplicate items? */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -701,6 +982,11 @@ typedef struct BTOptions BTREE_DEFAULT_FILLFACTOR) #define BTGetTargetPageFreeSpace(relation) \ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetDeduplicateItems(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? \ + ((BTOptions *) (relation)->rd_options)->deduplicate_items : true)) /* * Constant definition for progress reporting. 
Phase numbers must match @@ -747,6 +1033,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +/* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber baseoff); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state); +extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, + int nhtids); +extern void _bt_update_posting(BTVacuumPosting vacposting); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + /* * prototypes for functions in nbtinsert.c */ @@ -765,14 +1067,16 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool allequalimage); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); -extern bool _bt_heapkeyspace(Relation rel); +extern void _bt_metaversion(Relation rel, bool *heapkeyspace, + bool *allequalimage); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -781,7 +1085,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable); + OffsetNumber *deletable, int ndeletable, + BTVacuumPosting *updatable, int nupdatable); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 776a9bd723..347976c532 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */ +#define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */ #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ @@ -53,21 +54,34 @@ typedef struct xl_btree_metadata uint32 fastlevel; TransactionId oldest_btpo_xact; float8 last_cleanup_num_heap_tuples; + bool allequalimage; } xl_btree_metadata; /* * This is what we need to know about simple (without split) insert. * - * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. - * Note that INSERT_META implies it's not a leaf page. 
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and + * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a + * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf + * page. * - * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 0: original page * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META * Backup Blk 2: xl_btree_metadata, if INSERT_META + * + * Note: The new tuple is actually the "original" new item in the posting + * list split insert case (i.e. the INSERT_POST case). A split offset for + * the posting list is logged before the original new item. Recovery needs + * both, since it must do an in-place update of the existing posting list + * that was split as an extra step. Also, recovery generates a "final" + * newitem. See _bt_swap_posting() for details on posting list splits. */ typedef struct xl_btree_insert { OffsetNumber offnum; + + /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */ + /* NEW TUPLE ALWAYS FOLLOWS AT THE END */ } xl_btree_insert; #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) @@ -92,8 +106,37 @@ typedef struct xl_btree_insert * Backup Blk 0: original page / new left page * * The left page's data portion contains the new item, if it's the _L variant. - * An IndexTuple representing the high key of the left page must follow with - * either variant. + * _R variant split records generally do not have a newitem (_R variant leaf + * page split records that must deal with a posting list split will include an + * explicit newitem, though it is never used on the right page -- it is + * actually an orignewitem needed to update existing posting list). The new + * high key of the left/original page appears last of all (and must always be + * present). + * + * Page split records that need the REDO routine to deal with a posting list + * split directly will have an explicit newitem, which is actually an + * orignewitem (the newitem as it was before the posting list split, not + * after). A posting list split always has a newitem that comes immediately + * after the posting list being split (which would have overlapped with + * orignewitem prior to split). Usually REDO must deal with posting list + * splits with an _L variant page split record, and usually both the new + * posting list and the final newitem go on the left page (the existing + * posting list will be inserted instead of the old, and the final newitem + * will be inserted next to that). However, _R variant split records will + * include an orignewitem when the split point for the page happens to have a + * lastleft tuple that is also the posting list being split (leaving newitem + * as the page split's firstright tuple). The existence of this corner case + * does not change the basic fact about newitem/orignewitem for the REDO + * routine: it is always state used for the left page alone. (This is why the + * record's postingoff field isn't a reliable indicator of whether or not a + * posting list split occurred during the page split; a non-zero value merely + * indicates that the REDO routine must reconstruct a new posting list tuple + * that is needed for the left page.) + * + * This posting list split handling is equivalent to the xl_btree_insert REDO + * routine's INSERT_POST handling. While the details are more complicated + * here, the concept and goals are exactly the same. See _bt_swap_posting() + * for details on posting list splits. 
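As a plain-C illustration of the posting list split that this WAL commentary keeps referring to, the toy program below mimics what _bt_swap_posting() does with the TID array: the incoming TID is slotted into its ordered position, and the displaced rightmost TID becomes the heap TID of the final newitem. Integers stand in for heap TIDs; everything else about the real function (tuple copying, the orignewitem/postingoff WAL state) is omitted.

/*
 * Toy model of the TID shuffle performed during a posting list split.
 * Plain ints stand in for heap TIDs; the real _bt_swap_posting() works on
 * ItemPointerData arrays embedded in index tuples.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int			posting[] = {10, 20, 30, 40, 50};	/* existing posting list TIDs */
	int			nhtids = 5;
	int			newtid = 25;	/* incoming TID, falls inside the list's range */
	int			postingoff = 2; /* position where newtid belongs (20 < 25 < 30) */
	int			finalnewtid;

	/* The rightmost TID is displaced and becomes the "final" new item */
	finalnewtid = posting[nhtids - 1];

	/* Shift TIDs at postingoff.. one place right, then drop newtid in the gap */
	memmove(&posting[postingoff + 1], &posting[postingoff],
			(nhtids - postingoff - 1) * sizeof(int));
	posting[postingoff] = newtid;

	for (int i = 0; i < nhtids; i++)
		printf("%d ", posting[i]);		/* 10 20 25 30 40 */
	printf("\nfinal newitem TID: %d\n", finalnewtid);	/* 50 */
	return 0;
}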
* * Backup Blk 1: new right page * @@ -111,15 +154,33 @@ typedef struct xl_btree_split { uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ - OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */ + OffsetNumber newitemoff; /* new item's offset */ + uint16 postingoff; /* offset inside orig posting tuple */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16)) + +/* + * When page is deduplicated, consecutive groups of tuples with equal keys are + * merged together into posting list tuples. + * + * The WAL record represents a deduplication pass for a leaf page. An array + * of BTDedupInterval structs follows. + */ +typedef struct xl_btree_dedup +{ + uint16 nintervals; + + /* DEDUPLICATION INTERVALS FOLLOW */ +} xl_btree_dedup; + +#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16)) /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a - * single index page when *not* executed by VACUUM. + * single index page when *not* executed by VACUUM. Deletion of a subset of + * the TIDs within a posting list tuple is not supported. * * Backup Blk 0: index page */ @@ -150,21 +211,43 @@ typedef struct xl_btree_reuse_page #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) /* - * This is what we need to know about vacuum of individual leaf index tuples. - * The WAL record can represent deletion of any number of index tuples on a - * single index page when executed by VACUUM. + * This is what we need to know about which TIDs to remove from an individual + * posting list tuple during vacuuming. An array of these may appear at the + * end of xl_btree_vacuum records. + */ +typedef struct xl_btree_update +{ + uint16 ndeletedtids; + + /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ +} xl_btree_update; + +#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) + +/* + * This is what we need to know about a VACUUM of a leaf page. The WAL record + * can represent deletion of any number of index tuples on a single index page + * when executed by VACUUM. It can also support "updates" of index tuples, + * which is how deletes of a subset of TIDs contained in an existing posting + * list tuple are implemented. (Updates are only used when there will be some + * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can + * just be deleted). * - * Note that the WAL record in any vacuum of an index must have at least one - * item to delete. + * Updated posting list tuples are represented using xl_btree_update metadata. + * The REDO routine uses each xl_btree_update (plus its corresponding original + * index tuple from the target leaf page) to generate the final updated tuple. */ typedef struct xl_btree_vacuum { - uint32 ndeleted; + uint16 ndeleted; + uint16 nupdated; /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES METADATA ARRAY FOLLOWS */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) /* * This is what we need to know about marking an empty branch for deletion. 
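Restating the xl_btree_vacuum layout described above in executable form: the sketch below adds up the variable-length portion of a hypothetical record with a few deleted items and two updated posting list tuples. OffsetNumber is modelled as a bare uint16 and alignment padding is ignored, so the numbers are illustrative rather than exact.

/*
 * Rough, standalone size calculation for the xl_btree_vacuum payload layout
 * described above: deleted offsets, then updated offsets, then one
 * xl_btree_update (plus its uint16 TID offsets) per updated posting list
 * tuple.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint16_t OffsetNumber;

int
main(void)
{
	int			ndeleted = 7;	/* whole tuples removed from the leaf page */
	int			nupdated = 2;	/* posting list tuples losing only some TIDs */
	int			ndeletedtids[] = {3, 5};	/* TIDs removed per updated tuple */
	size_t		size;

	/* main record: ndeleted + nupdated counters */
	size = 2 * sizeof(uint16_t);
	/* deleted target offset numbers, then updated target offset numbers */
	size += (ndeleted + nupdated) * sizeof(OffsetNumber);
	/* per-update metadata: ndeletedtids counter plus that many uint16 offsets */
	for (int i = 0; i < nupdated; i++)
		size += sizeof(uint16_t) + ndeletedtids[i] * sizeof(uint16_t);

	printf("payload bytes: %zu\n", size);	/* 4 + 18 + (2+6) + (2+10) = 42 */
	return 0;
}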
@@ -245,6 +328,8 @@ typedef struct xl_btree_newroot extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTXLOG_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index c88dccfb8d..6c15df7e70 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 087918d41d..27ded593ab 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD105 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index f567117a46..1646deb092 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -200,7 +200,7 @@ reset enable_indexscan; reset enable_bitmapscan; -- Also check LIKE optimization with binary-compatible cases create temp table btree_bpchar (f1 text collate "C"); -create index on btree_bpchar(f1 bpchar_ops); +create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on); insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux'); -- doesn't match index: explain (costs off) @@ -266,6 +266,24 @@ select * from btree_bpchar where f1::bpchar like 'foo%'; fool (2 rows) +-- get test coverage for "single value" deduplication strategy: +insert into btree_bpchar select 'foo' from generate_series(1,1500); +-- +-- Perform unique checking, with and without the use of deduplication +-- +CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on); +CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off); +-- Generate enough garbage tuples in index to ensure that even the unique index +-- with deduplication enabled has to check multiple leaf pages during unique +-- checking (at least with a BLCKSZ of 8192 or less) +DO $$ +BEGIN + FOR r IN 1..1350 LOOP + DELETE FROM dedup_unique_test_table; + INSERT INTO dedup_unique_test_table SELECT 1; + END LOOP; +END$$; -- -- Test B-tree fast path (cache rightmost leaf 
page) optimization. -- diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 558dcae0ec..6e14b935ce 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -86,7 +86,7 @@ reset enable_bitmapscan; -- Also check LIKE optimization with binary-compatible cases create temp table btree_bpchar (f1 text collate "C"); -create index on btree_bpchar(f1 bpchar_ops); +create index on btree_bpchar(f1 bpchar_ops) WITH (deduplicate_items=on); insert into btree_bpchar values ('foo'), ('fool'), ('bar'), ('quux'); -- doesn't match index: explain (costs off) @@ -103,6 +103,26 @@ explain (costs off) select * from btree_bpchar where f1::bpchar like 'foo%'; select * from btree_bpchar where f1::bpchar like 'foo%'; +-- get test coverage for "single value" deduplication strategy: +insert into btree_bpchar select 'foo' from generate_series(1,1500); + +-- +-- Perform unique checking, with and without the use of deduplication +-- +CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=on); +CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off); +-- Generate enough garbage tuples in index to ensure that even the unique index +-- with deduplication enabled has to check multiple leaf pages during unique +-- checking (at least with a BLCKSZ of 8192 or less) +DO $$ +BEGIN + FOR r IN 1..1350 LOOP + DELETE FROM dedup_unique_test_table; + INSERT INTO dedup_unique_test_table SELECT 1; + END LOOP; +END$$; + -- -- Test B-tree fast path (cache rightmost leaf page) optimization. --
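Finally, stepping back from the regression tests to the deduplication data structures declared earlier (BTDedupInterval and friends): the standalone sketch below models the interval-building pass over a leaf page using plain integer keys. Runs of equal keys longer than one item become intervals; single items are left alone, matching the rule that unchanged tuples never appear in the intervals array. The real pass in nbtdedup.c additionally enforces maxpostingsize and actually rewrites the page, none of which is modelled here.

/*
 * Toy model of the interval-building pass over a leaf page, loosely based on
 * the BTDedupInterval/BTDedupState structs above.  Keys are plain ints and
 * "offsets" are array indexes.
 */
#include <stdio.h>

typedef struct
{
	int			baseoff;		/* index of "base" item of pending group */
	int			nitems;			/* items merged into the group, incl. base */
} ToyDedupInterval;

int
main(void)
{
	int			keys[] = {1, 1, 1, 2, 3, 3, 4, 4, 4, 4};	/* page items, in key order */
	int			nkeys = sizeof(keys) / sizeof(keys[0]);
	ToyDedupInterval intervals[10];
	int			nintervals = 0;
	int			baseoff = 0;

	for (int off = 1; off <= nkeys; off++)
	{
		if (off == nkeys || keys[off] != keys[baseoff])
		{
			/* pending group ends; only multi-item groups become intervals */
			if (off - baseoff > 1)
			{
				intervals[nintervals].baseoff = baseoff;
				intervals[nintervals].nitems = off - baseoff;
				nintervals++;
			}
			baseoff = off;
		}
	}

	for (int i = 0; i < nintervals; i++)
		printf("interval %d: baseoff=%d nitems=%d\n",
			   i, intervals[i].baseoff, intervals[i].nitems);
	/* Prints groups for keys 1 (3 items), 3 (2 items) and 4 (4 items); the
	 * lone key 2 is left as-is, matching the "not in the array" rule above. */
	return 0;
}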