1996-07-09 08:22:35 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
2003-11-09 22:30:38 +01:00
|
|
|
* nbtinsert.c
|
1996-07-09 08:22:35 +02:00
|
|
|
* Item insertion in Lehman and Yao btrees for Postgres.
|
|
|
|
*
|
2024-01-04 02:49:05 +01:00
|
|
|
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/access/nbtree/nbtinsert.c
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "postgres.h"
|
1996-10-23 09:42:13 +02:00
|
|
|
|
1999-07-16 07:00:38 +02:00
|
|
|
#include "access/nbtree.h"
|
2017-02-14 21:37:59 +01:00
|
|
|
#include "access/nbtxlog.h"
|
2006-07-13 18:49:20 +02:00
|
|
|
#include "access/transam.h"
|
2014-11-06 12:52:08 +01:00
|
|
|
#include "access/xloginsert.h"
|
2024-02-16 21:05:36 +01:00
|
|
|
#include "common/int.h"
|
Replace random(), pg_erand48(), etc with a better PRNG API and algorithm.
Standardize on xoroshiro128** as our basic PRNG algorithm, eliminating
a bunch of platform dependencies as well as fundamentally-obsolete PRNG
code. In addition, this API replacement will ease replacing the
algorithm again in future, should that become necessary.
xoroshiro128** is a few percent slower than the drand48 family,
but it can produce full-width 64-bit random values not only 48-bit,
and it should be much more trustworthy. It's likely to be noticeably
faster than the platform's random(), depending on which platform you
are thinking about; and we can have non-global state vectors easily,
unlike with random(). It is not cryptographically strong, but neither
are the functions it replaces.
Fabien Coelho, reviewed by Dean Rasheed, Aleksander Alekseev, and myself
Discussion: https://postgr.es/m/alpine.DEB.2.22.394.2105241211230.165418@pseudo
2021-11-29 03:32:36 +01:00
|
|
|
#include "common/pg_prng.h"
|
2021-01-13 18:21:32 +01:00
|
|
|
#include "lib/qunique.h"
|
2001-01-14 06:08:17 +01:00
|
|
|
#include "miscadmin.h"
|
2008-05-12 02:00:54 +02:00
|
|
|
#include "storage/lmgr.h"
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with other
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead for
other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
#include "storage/predicate.h"
|
2018-03-26 14:09:24 +02:00
|
|
|
#include "storage/smgr.h"
|
1996-10-20 12:53:18 +02:00
|
|
|
|
2018-04-11 00:21:03 +02:00
|
|
|
/* Minimum tree height for application of fastpath optimization */
|
|
|
|
#define BTREE_FASTPATH_MIN_LEVEL 2
|
1996-11-03 13:35:27 +01:00
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
|
2023-04-02 05:12:26 +02:00
|
|
|
static BTStack _bt_search_insert(Relation rel, Relation heaprel,
|
|
|
|
BTInsertState insertstate);
|
2019-03-20 17:30:57 +01:00
|
|
|
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
|
|
|
|
Relation heapRel,
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows specifying an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using an
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
IndexUniqueCheck checkUnique, bool *is_unique,
|
|
|
|
uint32 *speculativeToken);
|
2019-03-20 17:30:57 +01:00
|
|
|
static OffsetNumber _bt_findinsertloc(Relation rel,
|
|
|
|
BTInsertState insertstate,
|
|
|
|
bool checkingunique,
|
2021-01-13 18:21:32 +01:00
|
|
|
bool indexUnchanged,
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
BTStack stack,
|
2010-03-28 11:27:02 +02:00
|
|
|
Relation heapRel);
|
2023-04-02 05:12:26 +02:00
|
|
|
static void _bt_stepright(Relation rel, Relation heaprel,
|
|
|
|
BTInsertState insertstate, BTStack stack);
|
|
|
|
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Buffer buf,
|
|
|
|
Buffer cbuf,
|
2000-07-21 08:42:39 +02:00
|
|
|
BTStack stack,
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple itup,
|
2020-03-16 20:00:10 +01:00
|
|
|
Size itemsz,
|
2007-03-03 21:13:06 +01:00
|
|
|
OffsetNumber newitemoff,
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
int postingoff,
|
2003-02-21 01:06:22 +01:00
|
|
|
bool split_only_page);
|
2023-04-02 05:12:26 +02:00
|
|
|
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
|
|
|
|
Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
|
|
|
|
Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
IndexTuple nposting, uint16 postingoff);
|
2023-04-02 05:12:26 +02:00
|
|
|
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
|
|
|
|
Buffer rbuf, BTStack stack, bool isroot, bool isonly);
|
2023-06-10 23:08:25 +02:00
|
|
|
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
|
2020-04-14 01:39:55 +02:00
|
|
|
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
|
|
|
|
OffsetNumber itup_off, bool newfirstdataitem);
|
2020-11-17 18:45:56 +01:00
|
|
|
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
|
|
|
|
BTInsertState insertstate,
|
2021-01-13 18:21:32 +01:00
|
|
|
bool simpleonly, bool checkingunique,
|
|
|
|
bool uniquedup, bool indexUnchanged);
|
|
|
|
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
|
|
|
|
OffsetNumber *deletable, int ndeletable,
|
|
|
|
IndexTuple newitem, OffsetNumber minoff,
|
|
|
|
OffsetNumber maxoff);
|
|
|
|
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
|
|
|
|
int ndeletable, IndexTuple newitem,
|
|
|
|
int *nblocks);
|
|
|
|
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
|
1996-07-09 08:22:35 +02:00
|
|
|
|
|
|
|
/*
|
2006-01-26 00:04:21 +01:00
|
|
|
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2014-02-26 17:48:21 +01:00
|
|
|
* This routine is called by the public interface routine, btinsert.
|
|
|
|
* By here, itup is filled in, including the TID.
|
2009-07-29 22:56:21 +02:00
|
|
|
*
|
|
|
|
* If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
|
|
|
|
* will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
|
|
|
|
* UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
|
|
|
|
* For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
|
|
|
|
* don't actually insert.
|
|
|
|
*
|
2021-01-13 18:21:32 +01:00
|
|
|
* indexUnchanged executor hint indicates if itup is from an
|
|
|
|
* UPDATE that didn't logically change the indexed value, but
|
|
|
|
* must nevertheless have a new entry to point to a successor
|
|
|
|
* version.
|
|
|
|
*
|
2009-07-29 22:56:21 +02:00
|
|
|
* The result value is only significant for UNIQUE_CHECK_PARTIAL:
|
2017-08-16 06:22:32 +02:00
|
|
|
* it must be true if the entry is known unique, else false.
|
|
|
|
* (In the current implementation we'll also return true after a
|
2009-07-29 22:56:21 +02:00
|
|
|
* successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
|
|
|
|
* that's just a coding artifact.)
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2009-07-29 22:56:21 +02:00
|
|
|
bool
|
2006-01-26 00:04:21 +01:00
|
|
|
_bt_doinsert(Relation rel, IndexTuple itup,
|
2021-01-13 18:21:32 +01:00
|
|
|
IndexUniqueCheck checkUnique, bool indexUnchanged,
|
|
|
|
Relation heapRel)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2009-07-29 22:56:21 +02:00
|
|
|
bool is_unique = false;
|
2019-03-20 17:30:57 +01:00
|
|
|
BTInsertStateData insertstate;
|
|
|
|
BTScanInsert itup_key;
|
2020-03-18 22:42:49 +01:00
|
|
|
BTStack stack;
|
2019-03-20 17:30:57 +01:00
|
|
|
bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
|
2018-04-07 22:00:39 +02:00
|
|
|
|
2006-01-17 01:09:01 +01:00
|
|
|
/* we need an insertion scan key to do our search, so build one */
|
2023-06-10 23:08:25 +02:00
|
|
|
itup_key = _bt_mkscankey(rel, itup);
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
|
|
|
|
if (checkingunique)
|
|
|
|
{
|
|
|
|
if (!itup_key->anynullkeys)
|
|
|
|
{
|
|
|
|
/* No (heapkeyspace) scantid until uniqueness established */
|
|
|
|
itup_key->scantid = NULL;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Scan key for new tuple contains NULL key values. Bypass
|
|
|
|
* checkingunique steps. They are unnecessary because core code
|
|
|
|
* considers NULL unequal to every value, including NULL.
|
|
|
|
*
|
|
|
|
* This optimization avoids O(N^2) behavior within the
|
|
|
|
* _bt_findinsertloc() heapkeyspace path when a unique index has a
|
|
|
|
* large number of "duplicates" with NULL key values.
|
|
|
|
*/
|
|
|
|
checkingunique = false;
|
|
|
|
/* Tuple is unique in the sense that core code cares about */
|
|
|
|
Assert(checkUnique != UNIQUE_CHECK_EXISTING);
|
|
|
|
is_unique = true;
|
|
|
|
}
|
|
|
|
}
|
2019-03-20 17:30:57 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill in the BTInsertState working area, to track the current page and
|
2020-03-16 20:00:10 +01:00
|
|
|
* position within the page to insert on.
|
|
|
|
*
|
|
|
|
* Note that itemsz is passed down to lower level code that deals with
|
|
|
|
* inserting the item. It must be MAXALIGN()'d. This ensures that space
|
|
|
|
* accounting code consistently considers the alignment overhead that we
|
|
|
|
* expect PageAddItem() will add later. (Actually, index_form_tuple() is
|
|
|
|
* already conservative about alignment, but we don't rely on that from
|
|
|
|
* this distance. Besides, preserving the "true" tuple size in index
|
|
|
|
* tuple headers for the benefit of nbtsplitloc.c might happen someday.
|
|
|
|
* Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
|
2019-03-20 17:30:57 +01:00
|
|
|
*/
|
|
|
|
insertstate.itup = itup;
|
|
|
|
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
|
|
|
|
insertstate.itup_key = itup_key;
|
|
|
|
insertstate.bounds_valid = false;
|
|
|
|
insertstate.buf = InvalidBuffer;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
insertstate.postingoff = 0;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2020-03-18 22:42:49 +01:00
|
|
|
search:
|
|
|
|
|
2018-03-26 14:09:24 +02:00
|
|
|
/*
|
2020-03-18 22:42:49 +01:00
|
|
|
* Find and lock the leaf page that the tuple should be added to by
|
|
|
|
* searching from the root page. insertstate.buf will hold a buffer that
|
|
|
|
* is locked in exclusive mode afterwards.
|
2018-03-26 14:09:24 +02:00
|
|
|
*/
|
2023-04-02 05:12:26 +02:00
|
|
|
stack = _bt_search_insert(rel, heapRel, &insertstate);
|
2019-03-20 17:30:57 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2020-03-18 22:42:49 +01:00
|
|
|
* checkingunique inserts are not allowed to go ahead when two tuples with
|
|
|
|
* equal key attribute values would be visible to new MVCC snapshots once
|
|
|
|
* the xact commits. Check for conflicts in the locked page/buffer (if
|
|
|
|
* needed) here.
|
|
|
|
*
|
|
|
|
* It might be necessary to check a page to the right in _bt_check_unique,
|
|
|
|
* though that should be very rare. In practice the first page the value
|
|
|
|
* could be on (with scantid omitted) is almost always also the only page
|
|
|
|
* that a matching tuple might be found on. This is due to the behavior
|
|
|
|
* of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
|
|
|
|
* only be allowed to cross a page boundary when there is no candidate
|
|
|
|
* leaf page split point that avoids it. Also, _bt_check_unique can use
|
|
|
|
* the leaf page high key to determine that there will be no duplicates on
|
|
|
|
* the right sibling without actually visiting it (it uses the high key in
|
|
|
|
* cases where the new item happens to belong at the far right of the leaf
|
|
|
|
* page).
|
2002-01-01 21:32:37 +01:00
|
|
|
*
|
|
|
|
* NOTE: obviously, _bt_check_unique can only detect keys that are already
|
|
|
|
* in the index; so it cannot defend against concurrent insertions of the
|
|
|
|
* same key. We protect against that by means of holding a write lock on
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
* the first page the value could be on, with omitted/-inf value for the
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* implicit heap TID tiebreaker attribute. Any other would-be inserter of
|
|
|
|
* the same key must acquire a write lock on the same page, so only one
|
|
|
|
* would-be inserter can be making the check at one time. Furthermore,
|
|
|
|
* once we are past the check we hold write locks continuously until we
|
|
|
|
* have performed our insertion, so no later inserter can fail to see our
|
|
|
|
* insertion. (This requires some care in _bt_findinsertloc.)
|
2002-01-01 21:32:37 +01:00
|
|
|
*
|
|
|
|
* If we must wait for another xact, we release the lock while waiting,
|
2020-03-18 22:42:49 +01:00
|
|
|
* and then must perform a new search.
|
2009-07-29 22:56:21 +02:00
|
|
|
*
|
|
|
|
* For a partial uniqueness check, we don't wait for the other xact. Just
|
|
|
|
* let the tuple in and return false for possibly non-unique, or true for
|
|
|
|
* definitely unique.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
2019-03-20 17:30:57 +01:00
|
|
|
if (checkingunique)
|
1997-01-10 11:06:20 +01:00
|
|
|
{
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows specifying an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using an
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
TransactionId xwait;
|
|
|
|
uint32 speculativeToken;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
|
|
|
|
&is_unique, &speculativeToken);
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2020-03-18 22:42:49 +01:00
|
|
|
if (unlikely(TransactionIdIsValid(xwait)))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
/* Have to wait for the other guy ... */
|
2019-03-20 17:30:57 +01:00
|
|
|
_bt_relbuf(rel, insertstate.buf);
|
|
|
|
insertstate.buf = InvalidBuffer;
|
2015-05-24 03:35:49 +02:00
|
|
|
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows specifying an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using an
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
/*
|
|
|
|
* If it's a speculative insertion, wait for it to finish (ie. to
|
|
|
|
* go ahead with the insertion, or kill the tuple). Otherwise
|
|
|
|
* wait for the transaction to finish as usual.
|
|
|
|
*/
|
|
|
|
if (speculativeToken)
|
|
|
|
SpeculativeInsertionWait(xwait, speculativeToken);
|
|
|
|
else
|
|
|
|
XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* start over... */
|
2018-03-26 14:09:24 +02:00
|
|
|
if (stack)
|
|
|
|
_bt_freestack(stack);
|
2020-03-18 22:42:49 +01:00
|
|
|
goto search;
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
|
|
|
|
/* Uniqueness is established -- restore heap tid as scantid */
|
|
|
|
if (itup_key->heapkeyspace)
|
|
|
|
itup_key->scantid = &itup->t_tid;
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
|
2009-07-29 22:56:21 +02:00
|
|
|
if (checkUnique != UNIQUE_CHECK_EXISTING)
|
|
|
|
{
|
2019-03-20 17:30:57 +01:00
|
|
|
OffsetNumber newitemoff;
|
|
|
|
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with other
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead for
other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
/*
|
|
|
|
* The only conflict predicate locking cares about for indexes is when
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* an index tuple insert conflicts with an existing lock. We don't
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
* know the actual page we're going to insert on for sure just yet in
|
|
|
|
* checkingunique and !heapkeyspace cases, but it's okay to use the
|
|
|
|
* first page the value could be on (with scantid omitted) instead.
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with other
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead for
other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
*/
|
2020-01-28 01:13:04 +01:00
|
|
|
CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
|
2019-03-20 17:30:57 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do the insertion. Note that insertstate contains cached binary
|
|
|
|
* search bounds established within _bt_check_unique when insertion is
|
|
|
|
* checkingunique.
|
|
|
|
*/
|
|
|
|
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
|
2021-01-13 18:21:32 +01:00
|
|
|
indexUnchanged, stack, heapRel);
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
|
|
|
|
stack, itup, insertstate.itemsz, newitemoff,
|
2020-03-16 20:00:10 +01:00
|
|
|
insertstate.postingoff, false);
|
2009-07-29 22:56:21 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* just release the buffer */
|
2019-03-20 17:30:57 +01:00
|
|
|
_bt_relbuf(rel, insertstate.buf);
|
2009-07-29 22:56:21 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/* be tidy */
|
2018-03-26 14:09:24 +02:00
|
|
|
if (stack)
|
|
|
|
_bt_freestack(stack);
|
2019-03-20 17:30:57 +01:00
|
|
|
pfree(itup_key);
|
2009-07-29 22:56:21 +02:00
|
|
|
|
|
|
|
return is_unique;
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
|
2020-03-18 22:42:49 +01:00
|
|
|
/*
|
|
|
|
* _bt_search_insert() -- _bt_search() wrapper for inserts
|
|
|
|
*
|
|
|
|
* Search the tree for a particular scankey, or more precisely for the first
|
|
|
|
* leaf page it could be on. Try to make use of the fastpath optimization's
|
|
|
|
* rightmost leaf page cache before actually searching the tree from the root
|
|
|
|
* page, though.
|
|
|
|
*
|
|
|
|
* Return value is a stack of parent-page pointers (though see notes about
|
|
|
|
* fastpath optimization and page splits below). insertstate->buf is set to
|
|
|
|
* the address of the leaf-page buffer, which is write-locked and pinned in
|
|
|
|
* all cases (if necessary by creating a new empty root page for caller).
|
|
|
|
*
|
|
|
|
* The fastpath optimization avoids most of the work of searching the tree
|
|
|
|
* repeatedly when a single backend inserts successive new tuples on the
|
|
|
|
* rightmost leaf page of an index. A backend cache of the rightmost leaf
|
|
|
|
* page is maintained within _bt_insertonpg(), and used here. The cache is
|
|
|
|
* invalidated here when an insert of a non-pivot tuple must take place on a
|
|
|
|
* non-rightmost leaf page.
|
|
|
|
*
|
|
|
|
* The optimization helps with indexes on an auto-incremented field. It also
|
|
|
|
* helps with indexes on datetime columns, as well as indexes with lots of
|
|
|
|
* NULL values. (NULLs usually get inserted in the rightmost page for single
|
|
|
|
* column indexes, since they usually get treated as coming after everything
|
|
|
|
* else in the key space. Individual NULL tuples will generally be placed on
|
|
|
|
* the rightmost leaf page due to the influence of the heap TID column.)
|
|
|
|
*
|
|
|
|
* Note that we avoid applying the optimization when there is insufficient
|
|
|
|
* space on the rightmost page to fit caller's new item. This is necessary
|
|
|
|
* because we'll need to return a real descent stack when a page split is
|
|
|
|
* expected (actually, caller can cope with a leaf page split that uses a NULL
|
|
|
|
* stack, but that's very slow and so must be avoided). Note also that the
|
|
|
|
* fastpath optimization acquires the lock on the page conditionally as a way
|
|
|
|
* of reducing extra contention when there are concurrent insertions into the
|
|
|
|
* rightmost page (we give up if we'd have to wait for the lock). We assume
|
|
|
|
* that it isn't useful to apply the optimization when there is contention,
|
|
|
|
* since each per-backend cache won't stay valid for long.
|
|
|
|
*/
|
|
|
|
static BTStack
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
|
2020-03-18 22:42:49 +01:00
|
|
|
{
|
|
|
|
Assert(insertstate->buf == InvalidBuffer);
|
|
|
|
Assert(!insertstate->bounds_valid);
|
|
|
|
Assert(insertstate->postingoff == 0);
|
|
|
|
|
|
|
|
if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
|
|
|
|
{
|
|
|
|
/* Simulate a _bt_getbuf() call with conditional locking */
|
|
|
|
insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
|
2020-07-22 00:50:58 +02:00
|
|
|
if (_bt_conditionallockbuf(rel, insertstate->buf))
|
2020-03-18 22:42:49 +01:00
|
|
|
{
|
|
|
|
Page page;
|
2020-11-17 18:01:14 +01:00
|
|
|
BTPageOpaque opaque;
|
2020-03-18 22:42:49 +01:00
|
|
|
|
|
|
|
_bt_checkpage(rel, insertstate->buf);
|
|
|
|
page = BufferGetPage(insertstate->buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2020-03-18 22:42:49 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if the page is still the rightmost leaf page and has
|
|
|
|
* enough free space to accommodate the new tuple. Also check
|
|
|
|
* that the insertion scan key is strictly greater than the first
|
|
|
|
* non-pivot tuple on the page. (Note that we expect itup_key's
|
|
|
|
* scantid to be unset when our caller is a checkingunique
|
|
|
|
* inserter.)
|
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_RIGHTMOST(opaque) &&
|
|
|
|
P_ISLEAF(opaque) &&
|
|
|
|
!P_IGNORE(opaque) &&
|
2020-03-18 22:42:49 +01:00
|
|
|
PageGetFreeSpace(page) > insertstate->itemsz &&
|
|
|
|
PageGetMaxOffsetNumber(page) >= P_HIKEY &&
|
|
|
|
_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Caller can use the fastpath optimization because cached
|
|
|
|
* block is still rightmost leaf page, which can fit caller's
|
|
|
|
* new tuple without splitting. Keep block in local cache for
|
|
|
|
* next insert, and have caller use NULL stack.
|
|
|
|
*
|
|
|
|
* Note that _bt_insert_parent() has an assertion that catches
|
|
|
|
* leaf page splits that somehow follow from a fastpath insert
|
|
|
|
* (it should only be passed a NULL stack when it must deal
|
|
|
|
* with a concurrent root page split, and never because a NULL
|
|
|
|
* stack was returned here).
|
|
|
|
*/
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Page unsuitable for caller, drop lock and pin */
|
|
|
|
_bt_relbuf(rel, insertstate->buf);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Lock unavailable, drop pin */
|
|
|
|
ReleaseBuffer(insertstate->buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Forget block, since cache doesn't appear to be useful */
|
|
|
|
RelationSetTargetBlock(rel, InvalidBlockNumber);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Cannot use optimization -- descend tree, return proper descent stack */
|
2023-04-02 05:12:26 +02:00
|
|
|
return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
|
2023-09-08 07:12:12 +02:00
|
|
|
BT_WRITE);
|
2020-03-18 22:42:49 +01:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* _bt_check_unique() -- Check for violation of unique index constraint
|
|
|
|
*
|
2001-08-24 01:06:38 +02:00
|
|
|
* Returns InvalidTransactionId if there is no conflict, else an xact ID
|
|
|
|
* we must wait for to see if it commits a conflicting tuple. If an actual
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
* conflict is detected, no return --- just ereport(). If an xact ID is
|
|
|
|
* returned, and the conflicting tuple still has a speculative insertion in
|
|
|
|
* progress, *speculativeToken is set to non-zero, and the caller can wait for
|
|
|
|
* the verdict on the insertion using SpeculativeInsertionWait().
|
2009-07-29 22:56:21 +02:00
|
|
|
*
|
|
|
|
* However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
|
|
|
|
* InvalidTransactionId because we don't want to wait. In this case we
|
|
|
|
* set *is_unique to false if there is a potential conflict, and the
|
|
|
|
* core code must redo the uniqueness check later.
|
2019-03-20 17:30:57 +01:00
|
|
|
*
|
|
|
|
* As a side-effect, sets state in insertstate that can later be used by
|
|
|
|
* _bt_findinsertloc() to reuse most of the binary search work we do
|
|
|
|
* here.
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
*
|
2022-02-03 11:29:54 +01:00
|
|
|
* This code treats NULLs as equal, unlike the default semantics for unique
|
|
|
|
* indexes. So do not call here when there are NULL values in scan key and
|
|
|
|
* the index uses the default NULLS DISTINCT mode.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
static TransactionId
|
2019-03-20 17:30:57 +01:00
|
|
|
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
IndexUniqueCheck checkUnique, bool *is_unique,
|
|
|
|
uint32 *speculativeToken)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2019-03-20 17:30:57 +01:00
|
|
|
IndexTuple itup = insertstate->itup;
|
2020-06-13 18:33:33 +02:00
|
|
|
IndexTuple curitup = NULL;
|
2021-04-08 21:54:31 +02:00
|
|
|
ItemId curitemid = NULL;
|
2019-03-20 17:30:57 +01:00
|
|
|
BTScanInsert itup_key = insertstate->itup_key;
|
2007-03-25 21:45:14 +02:00
|
|
|
SnapshotData SnapshotDirty;
|
2019-03-20 17:30:57 +01:00
|
|
|
OffsetNumber offset;
|
2007-03-03 21:13:06 +01:00
|
|
|
OffsetNumber maxoff;
|
2000-07-21 08:42:39 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
Buffer nbuf = InvalidBuffer;
|
2009-07-29 22:56:21 +02:00
|
|
|
bool found = false;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
bool inposting = false;
|
|
|
|
bool prevalldead = true;
|
|
|
|
int curposti = 0;
|
2009-07-29 22:56:21 +02:00
|
|
|
|
|
|
|
/* Assume unique until we find a duplicate */
|
|
|
|
*is_unique = true;
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2007-03-25 21:45:14 +02:00
|
|
|
InitDirtySnapshot(SnapshotDirty);
|
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
page = BufferGetPage(insertstate->buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2000-07-21 08:42:39 +02:00
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
/*
|
|
|
|
* Find the first tuple with the same key.
|
|
|
|
*
|
|
|
|
* This also saves the binary search bounds in insertstate. We use them
|
|
|
|
* in the fastpath below, but also in the _bt_findinsertloc() call later.
|
|
|
|
*/
|
2019-04-04 18:38:08 +02:00
|
|
|
Assert(!insertstate->bounds_valid);
|
2019-03-20 17:30:57 +01:00
|
|
|
offset = _bt_binsrch_insert(rel, insertstate);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Scan over all equal tuples, looking for live conflicts.
|
|
|
|
*/
|
2019-03-20 17:30:57 +01:00
|
|
|
Assert(!insertstate->bounds_valid || insertstate->low == offset);
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
Assert(!itup_key->anynullkeys);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Assert(itup_key->scantid == NULL);
|
2000-07-21 08:42:39 +02:00
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
/*
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* Each iteration of the loop processes one heap TID, not one index
|
|
|
|
* tuple. Current offset number for page isn't usually advanced on
|
|
|
|
* iterations that process heap TIDs from posting list tuples.
|
|
|
|
*
|
|
|
|
* "inposting" state is set when _inside_ a posting list --- not when
|
|
|
|
* we're at the start (or end) of a posting list. We advance curposti
|
|
|
|
* at the end of the iteration when inside a posting list tuple. In
|
|
|
|
* general, every loop iteration either advances the page offset or
|
|
|
|
* advances curposti --- an iteration that handles the rightmost/max
|
|
|
|
* heap TID in a posting list finally advances the page offset (and
|
|
|
|
* unsets "inposting").
|
|
|
|
*
|
|
|
|
* Make sure the offset points to an actual index tuple before trying
|
|
|
|
* to examine it...
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (offset <= maxoff)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2019-03-20 17:30:57 +01:00
|
|
|
/*
|
|
|
|
* Fastpath: In most cases, we can use cached search bounds to
|
|
|
|
* limit our consideration to items that are definitely
|
|
|
|
* duplicates. This fastpath doesn't apply when the original page
|
|
|
|
* is empty, or when initial offset is past the end of the
|
|
|
|
* original page, which may indicate that we need to examine a
|
|
|
|
* second or subsequent page.
|
|
|
|
*
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
* Note that this optimization allows us to avoid calling
|
|
|
|
* _bt_compare() directly when there are no duplicates, as long as
|
|
|
|
* the offset where the key will go is not at the end of the page.
|
2019-03-20 17:30:57 +01:00
|
|
|
*/
|
|
|
|
if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
|
|
|
|
{
|
|
|
|
Assert(insertstate->bounds_valid);
|
|
|
|
Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
|
|
|
|
Assert(insertstate->low <= insertstate->stricthigh);
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
Assert(_bt_compare(rel, itup_key, page, offset) < 0);
|
2019-03-20 17:30:57 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
1997-03-24 09:48:16 +01:00
|
|
|
/*
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* We can skip items that are already marked killed.
|
2003-09-03 00:10:16 +02:00
|
|
|
*
|
2019-03-20 17:30:57 +01:00
|
|
|
* In the presence of heavy update activity an index may contain
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
* many killed items with the same key; running _bt_compare() on
|
2019-03-20 17:30:57 +01:00
|
|
|
* each killed item gets expensive. Just advance over killed
|
Prevent O(N^2) unique index insertion edge case.
Commit dd299df8 made nbtree treat heap TID as a tiebreaker column,
establishing the principle that there is only one correct location (page
and page offset number) for every index tuple, no matter what.
Insertions of tuples into non-unique indexes proceed as if heap TID
(scan key's scantid) is just another user-attribute value, but
insertions into unique indexes are more delicate. The TID value in
scantid must initially be omitted to ensure that the unique index
insertion visits every leaf page that duplicates could be on. The
scantid is set once again after unique checking finishes successfully,
which can force _bt_findinsertloc() to step right one or more times, to
locate the leaf page that the new tuple must be inserted on.
Stepping right within _bt_findinsertloc() was assumed to occur no more
frequently than stepping right within _bt_check_unique(), but there was
one important case where that assumption was incorrect: inserting a
"duplicate" with NULL values. Since _bt_check_unique() didn't do any
real work in this case, it wasn't appropriate for _bt_findinsertloc() to
behave as if it was finishing off a conventional unique insertion, where
any existing physical duplicate must be dead or recently dead.
_bt_findinsertloc() might have to grovel through a substantial portion
of all of the leaf pages in the index to insert a single tuple, even
when there were no dead tuples.
To fix, treat insertions of tuples with NULLs into a unique index as if
they were insertions into a non-unique index: never unset scantid before
calling _bt_search() to descend the tree, and bypass _bt_check_unique()
entirely. _bt_check_unique() is no longer responsible for incoming
tuples with NULL values.
Discussion: https://postgr.es/m/CAH2-Wzm08nr+JPx4jMOa9CGqxWYDQ-_D4wtPBiKghXAUiUy-nQ@mail.gmail.com
2019-04-23 19:33:57 +02:00
|
|
|
* items as quickly as we can. We only apply _bt_compare() when
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* we get to a non-killed item. We could reuse the bounds to
|
|
|
|
* avoid _bt_compare() calls for known equal tuples, but it
|
2021-01-13 18:21:32 +01:00
|
|
|
* doesn't seem worth it.
|
1997-03-24 09:48:16 +01:00
|
|
|
*/
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (!inposting)
|
|
|
|
curitemid = PageGetItemId(page, offset);
|
|
|
|
if (inposting || !ItemIdIsDead(curitemid))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2007-09-20 19:56:33 +02:00
|
|
|
ItemPointerData htid;
|
2020-02-26 22:17:36 +01:00
|
|
|
bool all_dead = false;
|
2007-09-20 19:56:33 +02:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (!inposting)
|
|
|
|
{
|
|
|
|
/* Plain tuple, or first TID in posting list tuple */
|
|
|
|
if (_bt_compare(rel, itup_key, page, offset) != 0)
|
|
|
|
break; /* we're past all the equal tuples */
|
2003-09-03 00:10:16 +02:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* Advanced curitup */
|
|
|
|
curitup = (IndexTuple) PageGetItem(page, curitemid);
|
|
|
|
Assert(!BTreeTupleIsPivot(curitup));
|
|
|
|
}
|
|
|
|
|
|
|
|
/* okay, we gotta fetch the heap tuple using htid ... */
|
|
|
|
if (!BTreeTupleIsPosting(curitup))
|
|
|
|
{
|
|
|
|
/* ... htid is from simple non-pivot tuple */
|
|
|
|
Assert(!inposting);
|
|
|
|
htid = curitup->t_tid;
|
|
|
|
}
|
|
|
|
else if (!inposting)
|
|
|
|
{
|
|
|
|
/* ... htid is first TID in new posting list */
|
|
|
|
inposting = true;
|
|
|
|
prevalldead = true;
|
|
|
|
curposti = 0;
|
|
|
|
htid = *BTreeTupleGetPostingN(curitup, 0);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* ... htid is second or subsequent TID in posting list */
|
|
|
|
Assert(curposti > 0);
|
|
|
|
htid = *BTreeTupleGetPostingN(curitup, curposti);
|
|
|
|
}
|
2007-09-20 19:56:33 +02:00
|
|
|
|
2009-07-29 22:56:21 +02:00
|
|
|
/*
|
|
|
|
* If we are doing a recheck, we expect to find the tuple we
|
|
|
|
* are rechecking. It's not a duplicate, but we have to keep
|
|
|
|
* scanning.
|
|
|
|
*/
|
|
|
|
if (checkUnique == UNIQUE_CHECK_EXISTING &&
|
|
|
|
ItemPointerCompare(&htid, &itup->t_tid) == 0)
|
|
|
|
{
|
|
|
|
found = true;
|
|
|
|
}
|
|
|
|
|
2007-09-20 19:56:33 +02:00
|
|
|
/*
|
2019-03-26 00:52:55 +01:00
|
|
|
* Check if there are any table tuples for this index entry
|
|
|
|
* satisfying SnapshotDirty. This is necessary because for AMs
|
|
|
|
* with optimizations like heap's HOT, we have just a single
|
|
|
|
* index entry for the entire chain.
|
2007-09-20 19:56:33 +02:00
|
|
|
*/
|
2019-03-26 00:52:55 +01:00
|
|
|
else if (table_index_fetch_tuple_check(heapRel, &htid,
|
|
|
|
&SnapshotDirty,
|
|
|
|
&all_dead))
|
2002-05-24 20:57:57 +02:00
|
|
|
{
|
2009-07-29 22:56:21 +02:00
|
|
|
TransactionId xwait;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It is a duplicate. If we are only doing a partial
|
|
|
|
* check, then don't bother checking if the tuple is being
|
|
|
|
* updated in another transaction. Just return the fact
|
|
|
|
* that it is a potential conflict and leave the full
|
2019-04-04 18:38:08 +02:00
|
|
|
* check till later. Don't invalidate binary search
|
|
|
|
* bounds.
|
2009-07-29 22:56:21 +02:00
|
|
|
*/
|
|
|
|
if (checkUnique == UNIQUE_CHECK_PARTIAL)
|
|
|
|
{
|
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf);
|
|
|
|
*is_unique = false;
|
|
|
|
return InvalidTransactionId;
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2002-05-24 20:57:57 +02:00
|
|
|
/*
|
|
|
|
* If this tuple is being updated by other transaction
|
|
|
|
* then we have to wait for its commit/abort.
|
|
|
|
*/
|
2009-07-29 22:56:21 +02:00
|
|
|
xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
|
|
|
|
SnapshotDirty.xmin : SnapshotDirty.xmax;
|
|
|
|
|
2002-05-24 20:57:57 +02:00
|
|
|
if (TransactionIdIsValid(xwait))
|
|
|
|
{
|
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf);
|
|
|
|
/* Tell _bt_doinsert to wait... */
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows specifying an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using an
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
*speculativeToken = SnapshotDirty.speculativeToken;
|
2019-04-04 18:38:08 +02:00
|
|
|
/* Caller releases lock on buf immediately */
|
|
|
|
insertstate->bounds_valid = false;
|
2002-05-24 20:57:57 +02:00
|
|
|
return xwait;
|
|
|
|
}
|
1998-12-15 13:47:01 +01:00
|
|
|
|
2002-05-24 20:57:57 +02:00
|
|
|
/*
|
2006-08-25 06:06:58 +02:00
|
|
|
* Otherwise we have a definite conflict. But before
|
|
|
|
* complaining, look to see if the tuple we want to insert
|
|
|
|
* is itself now committed dead --- if so, don't complain.
|
|
|
|
* This is a waste of time in normal scenarios but we must
|
|
|
|
* do it to support CREATE INDEX CONCURRENTLY.
|
2007-09-20 19:56:33 +02:00
|
|
|
*
|
|
|
|
* We must follow HOT-chains here because during
|
|
|
|
* concurrent index build, we insert the root TID though
|
|
|
|
* the actual tuple may be somewhere in the HOT-chain.
|
|
|
|
* While following the chain we might not stop at the
|
|
|
|
* exact tuple which triggered the insert, but that's OK
|
|
|
|
* because if we find a live tuple anywhere in this chain,
|
|
|
|
* we have a unique key conflict. The other live tuple is
|
|
|
|
* not part of this chain because it had a different index
|
|
|
|
* entry.
|
2002-05-24 20:57:57 +02:00
|
|
|
*/
|
2020-06-25 19:55:28 +02:00
|
|
|
htid = itup->t_tid;
|
|
|
|
if (table_index_fetch_tuple_check(heapRel, &htid,
|
2019-03-26 00:52:55 +01:00
|
|
|
SnapshotSelf, NULL))
|
2006-08-25 06:06:58 +02:00
|
|
|
{
|
|
|
|
/* Normal case --- it's still live */
|
|
|
|
}
|
2007-09-20 19:56:33 +02:00
|
|
|
else
|
2006-08-25 06:06:58 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* It's been deleted, so no error, and no need to
|
|
|
|
* continue searching
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-04-07 18:12:35 +02:00
|
|
|
/*
|
|
|
|
* Check for a conflict-in as we would if we were going to
|
|
|
|
* write to this page. We aren't actually going to write,
|
|
|
|
* but we want a chance to report SSI conflicts that would
|
|
|
|
* otherwise be masked by this unique constraint
|
|
|
|
* violation.
|
|
|
|
*/
|
2020-01-28 01:13:04 +01:00
|
|
|
CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
|
2016-04-07 18:12:35 +02:00
|
|
|
|
2009-07-29 22:56:21 +02:00
|
|
|
/*
|
2009-08-01 21:59:41 +02:00
|
|
|
* This is a definite conflict. Break the tuple down into
|
|
|
|
* datums and report the error. But first, make sure we
|
|
|
|
* release the buffer locks we're holding ---
|
2009-08-01 22:59:17 +02:00
|
|
|
* BuildIndexValueDescription could make catalog accesses,
|
2009-08-01 21:59:41 +02:00
|
|
|
* which in the worst case might touch this same index and
|
|
|
|
* cause deadlocks.
|
2009-07-29 22:56:21 +02:00
|
|
|
*/
|
2009-08-01 21:59:41 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf);
|
2019-03-20 17:30:57 +01:00
|
|
|
_bt_relbuf(rel, insertstate->buf);
|
|
|
|
insertstate->buf = InvalidBuffer;
|
2019-04-04 18:38:08 +02:00
|
|
|
insertstate->bounds_valid = false;
|
2009-08-01 21:59:41 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
Datum values[INDEX_MAX_KEYS];
|
|
|
|
bool isnull[INDEX_MAX_KEYS];
|
Fix column-privilege leak in error-message paths
While building error messages to return to the user,
BuildIndexValueDescription, ExecBuildSlotValueDescription and
ri_ReportViolation would happily include the entire key or entire row in
the result returned to the user, even if the user didn't have access to
view all of the columns being included.
Instead, include only those columns which the user is providing or which
the user has select rights on. If the user does not have any rights
to view the table or any of the columns involved then no detail is
provided and a NULL value is returned from BuildIndexValueDescription
and ExecBuildSlotValueDescription. Note that, for key cases, the user
must have access to all of the columns for the key to be shown; a
partial key will not be returned.
Further, in master only, do not return any data for cases where row
security is enabled on the relation and row security should be applied
for the user. This required a bit of refactoring and moving of things
around related to RLS; note the addition of utils/misc/rls.c.
Back-patch all the way, as column-level privileges are now in all
supported versions.
This has been assigned CVE-2014-8161, but since the issue and the patch
have already been publicized on pgsql-hackers, there's no point in trying
to hide this commit.
2015-01-12 23:04:11 +01:00
|
|
|
char *key_desc;
|
2009-08-01 21:59:41 +02:00
|
|
|
|
|
|
|
index_deform_tuple(itup, RelationGetDescr(rel),
|
|
|
|
values, isnull);
|
Fix column-privilege leak in error-message paths
While building error messages to return to the user,
BuildIndexValueDescription, ExecBuildSlotValueDescription and
ri_ReportViolation would happily include the entire key or entire row in
the result returned to the user, even if the user didn't have access to
view all of the columns being included.
Instead, include only those columns which the user is providing or which
the user has select rights on. If the user does not have any rights
to view the table or any of the columns involved then no detail is
provided and a NULL value is returned from BuildIndexValueDescription
and ExecBuildSlotValueDescription. Note that, for key cases, the user
must have access to all of the columns for the key to be shown; a
partial key will not be returned.
Further, in master only, do not return any data for cases where row
security is enabled on the relation and row security should be applied
for the user. This required a bit of refactoring and moving of things
around related to RLS -- note the addition of utils/misc/rls.c.
Back-patch all the way, as column-level privileges are now in all
supported versions.
This has been assigned CVE-2014-8161, but since the issue and the patch
have already been publicized on pgsql-hackers, there's no point in trying
to hide this commit.
2015-01-12 23:04:11 +01:00
|
|
|
|
|
|
|
key_desc = BuildIndexValueDescription(rel, values,
|
|
|
|
isnull);
|
|
|
|
|
2009-08-01 22:59:17 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_UNIQUE_VIOLATION),
|
|
|
|
errmsg("duplicate key value violates unique constraint \"%s\"",
|
|
|
|
RelationGetRelationName(rel)),
|
Fix column-privilege leak in error-message paths
While building error messages to return to the user,
BuildIndexValueDescription, ExecBuildSlotValueDescription and
ri_ReportViolation would happily include the entire key or entire row in
the result returned to the user, even if the user didn't have access to
view all of the columns being included.
Instead, include only those columns which the user is providing or which
the user has select rights on. If the user does not have any rights
to view the table or any of the columns involved then no detail is
provided and a NULL value is returned from BuildIndexValueDescription
and ExecBuildSlotValueDescription. Note that, for key cases, the user
must have access to all of the columns for the key to be shown; a
partial key will not be returned.
Further, in master only, do not return any data for cases where row
security is enabled on the relation and row security should be applied
for the user. This required a bit of refactoring and moving of things
around related to RLS -- note the addition of utils/misc/rls.c.
Back-patch all the way, as column-level privileges are now in all
supported versions.
This has been assigned CVE-2014-8161, but since the issue and the patch
have already been publicized on pgsql-hackers, there's no point in trying
to hide this commit.
2015-01-12 23:04:11 +01:00
|
|
|
key_desc ? errdetail("Key %s already exists.",
|
|
|
|
key_desc) : 0,
|
Provide database object names as separate fields in error messages.
This patch addresses the problem that applications currently have to
extract object names from possibly-localized textual error messages,
if they want to know for example which index caused a UNIQUE_VIOLATION
failure. It adds new error message fields to the wire protocol, which
can carry the name of a table, table column, data type, or constraint
associated with the error. (Since the protocol spec has always instructed
clients to ignore unrecognized field types, this should not create any
compatibility problem.)
Support for providing these new fields has been added to just a limited set
of error reports (mainly, those in the "integrity constraint violation"
SQLSTATE class), but we will doubtless add them to more calls in future.
Pavel Stehule, reviewed and extensively revised by Peter Geoghegan, with
additional hacking by Tom Lane.
2013-01-29 23:06:26 +01:00
|
|
|
errtableconstraint(heapRel,
|
|
|
|
RelationGetRelationName(rel))));
|
2009-08-01 21:59:41 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
else if (all_dead && (!inposting ||
|
|
|
|
(prevalldead &&
|
|
|
|
curposti == BTreeTupleGetNPosting(curitup) - 1)))
|
2002-05-24 20:57:57 +02:00
|
|
|
{
|
|
|
|
/*
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* The conflicting tuple (or all HOT chains pointed to by
|
|
|
|
* all posting list TIDs) is dead to everyone, so mark the
|
|
|
|
* index entry killed.
|
2002-05-24 20:57:57 +02:00
|
|
|
*/
|
2007-09-20 19:56:33 +02:00
|
|
|
ItemIdMarkDead(curitemid);
|
|
|
|
opaque->btpo_flags |= BTP_HAS_GARBAGE;
|
2013-03-22 14:54:07 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark buffer with a dirty hint, since state is not
|
|
|
|
* crucial. Be sure to mark the proper buffer dirty.
|
|
|
|
*/
|
2007-09-20 19:56:33 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
2013-06-17 17:02:12 +02:00
|
|
|
MarkBufferDirtyHint(nbuf, true);
|
2007-09-20 19:56:33 +02:00
|
|
|
else
|
2019-03-20 17:30:57 +01:00
|
|
|
MarkBufferDirtyHint(insertstate->buf, true);
|
2002-05-24 20:57:57 +02:00
|
|
|
}
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Remember if posting list tuple has even a single HOT chain
|
|
|
|
* whose members are not all dead
|
|
|
|
*/
|
|
|
|
if (!all_dead && inposting)
|
|
|
|
prevalldead = false;
|
1997-01-10 11:06:20 +01:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
|
|
|
|
{
|
|
|
|
/* Advance to next TID in same posting list */
|
|
|
|
curposti++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (offset < maxoff)
|
|
|
|
{
|
|
|
|
/* Advance to next tuple */
|
|
|
|
curposti = 0;
|
|
|
|
inposting = false;
|
2000-07-21 08:42:39 +02:00
|
|
|
offset = OffsetNumberNext(offset);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
|
|
|
{
|
2019-03-20 17:30:57 +01:00
|
|
|
int highkeycmp;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* If scankey == hikey we gotta check the next page too */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
|
|
|
break;
|
2019-03-20 17:30:57 +01:00
|
|
|
highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
|
|
|
|
Assert(highkeycmp <= 0);
|
|
|
|
if (highkeycmp != 0)
|
2000-07-21 08:42:39 +02:00
|
|
|
break;
|
2003-02-22 01:45:05 +01:00
|
|
|
/* Advance to next non-dead page --- there must be one */
|
|
|
|
for (;;)
|
|
|
|
{
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
BlockNumber nblkno = opaque->btpo_next;
|
|
|
|
|
2004-04-21 20:24:26 +02:00
|
|
|
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
|
2016-04-20 15:31:19 +02:00
|
|
|
page = BufferGetPage(nbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2003-02-22 01:45:05 +01:00
|
|
|
if (!P_IGNORE(opaque))
|
|
|
|
break;
|
|
|
|
if (P_RIGHTMOST(opaque))
|
2007-12-31 05:52:05 +01:00
|
|
|
elog(ERROR, "fell off the end of index \"%s\"",
|
2003-02-22 01:45:05 +01:00
|
|
|
RelationGetRelationName(rel));
|
|
|
|
}
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* Will also advance to next tuple */
|
|
|
|
curposti = 0;
|
|
|
|
inposting = false;
|
2000-07-21 08:42:39 +02:00
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
offset = P_FIRSTDATAKEY(opaque);
|
2019-04-04 18:38:08 +02:00
|
|
|
/* Don't invalidate binary search bounds */
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
1997-01-10 11:06:20 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2009-07-29 22:56:21 +02:00
|
|
|
/*
|
|
|
|
* If we are doing a recheck then we should have found the tuple we are
|
|
|
|
* checking. Otherwise there's something very wrong --- probably, the
|
|
|
|
* index is on a non-immutable expression.
|
|
|
|
*/
|
|
|
|
if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INTERNAL_ERROR),
|
|
|
|
errmsg("failed to re-find tuple within index \"%s\"",
|
|
|
|
RelationGetRelationName(rel)),
|
Provide database object names as separate fields in error messages.
This patch addresses the problem that applications currently have to
extract object names from possibly-localized textual error messages,
if they want to know for example which index caused a UNIQUE_VIOLATION
failure. It adds new error message fields to the wire protocol, which
can carry the name of a table, table column, data type, or constraint
associated with the error. (Since the protocol spec has always instructed
clients to ignore unrecognized field types, this should not create any
compatibility problem.)
Support for providing these new fields has been added to just a limited set
of error reports (mainly, those in the "integrity constraint violation"
SQLSTATE class), but we will doubtless add them to more calls in future.
Pavel Stehule, reviewed and extensively revised by Peter Geoghegan, with
additional hacking by Tom Lane.
2013-01-29 23:06:26 +01:00
|
|
|
errhint("This may be because of a non-immutable index expression."),
|
|
|
|
errtableconstraint(heapRel,
|
|
|
|
RelationGetRelationName(rel))));
|
2009-07-29 22:56:21 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, nbuf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-08-24 01:06:38 +02:00
|
|
|
return InvalidTransactionId;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2007-03-03 21:13:06 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_findinsertloc() -- Finds an insert location for a tuple
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* On entry, insertstate buffer contains the page the new tuple belongs
|
|
|
|
* on. It is exclusive-locked and pinned by the caller.
|
|
|
|
*
|
|
|
|
* If 'checkingunique' is true, the buffer on entry is the first page
|
|
|
|
* that contains duplicates of the new key. If there are duplicates on
|
|
|
|
* multiple pages, the correct insertion position might be some page to
|
|
|
|
* the right, rather than the first page. In that case, this function
|
|
|
|
* moves right to the correct target page.
|
2019-03-20 17:30:57 +01:00
|
|
|
*
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* (In a !heapkeyspace index, there can be multiple pages with the same
|
|
|
|
* high key, on any of which the new tuple could legitimately be placed.  In
|
|
|
|
* that case, the caller passes the first page containing duplicates,
|
2019-05-26 14:58:18 +02:00
|
|
|
* just like when checkingunique=true. If that page doesn't have enough
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* room for the new tuple, this function moves right, trying to find a
|
|
|
|
* legal page that does.)
|
2007-03-03 21:13:06 +01:00
|
|
|
*
|
2021-01-13 18:21:32 +01:00
|
|
|
* If 'indexUnchanged' is true, this is for an UPDATE that didn't
|
|
|
|
* logically change the indexed value, but must nevertheless have a new
|
|
|
|
* entry to point to a successor version. This hint from the executor
|
|
|
|
* will influence our behavior when the page might have to be split and
|
|
|
|
* we must consider our options. Bottom-up index deletion can avoid
|
|
|
|
* pathological version-driven page splits, but we only want to go to the
|
|
|
|
* trouble of trying it when we already have moderate confidence that
|
|
|
|
* it's appropriate. The hint should not significantly affect our
|
|
|
|
* behavior over time unless practically all inserts onto the leaf page
|
|
|
|
* get the hint.
|
|
|
|
*
|
2019-03-20 17:30:57 +01:00
|
|
|
* On exit, insertstate buffer contains the chosen insertion page, and
|
|
|
|
* the offset within that page is returned. If _bt_findinsertloc needed
|
|
|
|
* to move right, the lock and pin on the original page are released, and
|
|
|
|
* the new buffer is exclusively locked and pinned instead.
|
2007-03-03 21:13:06 +01:00
|
|
|
*
|
2019-03-20 17:30:57 +01:00
|
|
|
* If insertstate contains cached binary search bounds, we will take
|
|
|
|
* advantage of them. This avoids repeating comparisons that we made in
|
|
|
|
* _bt_check_unique() already.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2019-03-20 17:30:57 +01:00
|
|
|
static OffsetNumber
|
2007-03-03 21:13:06 +01:00
|
|
|
_bt_findinsertloc(Relation rel,
|
2019-03-20 17:30:57 +01:00
|
|
|
BTInsertState insertstate,
|
|
|
|
bool checkingunique,
|
2021-01-13 18:21:32 +01:00
|
|
|
bool indexUnchanged,
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
BTStack stack,
|
2010-03-28 11:27:02 +02:00
|
|
|
Relation heapRel)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2019-03-20 17:30:57 +01:00
|
|
|
BTScanInsert itup_key = insertstate->itup_key;
|
|
|
|
Page page = BufferGetPage(insertstate->buf);
|
2020-11-17 18:01:14 +01:00
|
|
|
BTPageOpaque opaque;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
OffsetNumber newitemoff;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/* Check 1/3 of a page restriction */
|
|
|
|
if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
|
|
|
|
_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
|
|
|
|
insertstate->itup);
|
1999-12-26 04:48:22 +01:00
|
|
|
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
|
2019-03-20 17:30:57 +01:00
|
|
|
Assert(!insertstate->bounds_valid || checkingunique);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
|
|
|
|
Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
|
2000-08-26 01:13:33 +02:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
if (itup_key->heapkeyspace)
|
2019-03-20 17:30:57 +01:00
|
|
|
{
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a
GIN-style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* Keep track of whether checkingunique duplicate seen */
|
2021-01-13 18:21:32 +01:00
|
|
|
bool uniquedup = indexUnchanged;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a
GIN-style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
|
2007-03-03 21:13:06 +01:00
|
|
|
/*
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* If we're inserting into a unique index, we may have to walk right
|
|
|
|
* through leaf pages to find the one leaf page that we must insert on
|
|
|
|
* to.
|
|
|
|
*
|
|
|
|
* This is needed for checkingunique callers because a scantid was not
|
|
|
|
* used when we called _bt_search(). scantid can only be set after
|
|
|
|
* _bt_check_unique() has checked for duplicates. The buffer
|
|
|
|
* initially stored in insertstate->buf has the page where the first
|
|
|
|
* duplicate key might be found, which isn't always the page that new
|
|
|
|
* tuple belongs on. The heap TID attribute for new tuple (scantid)
|
|
|
|
* could force us to insert on a sibling page, though that should be
|
|
|
|
* very rare in practice.
|
2007-03-03 21:13:06 +01:00
|
|
|
*/
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
if (checkingunique)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a
GIN-style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (insertstate->low < insertstate->stricthigh)
|
|
|
|
{
|
|
|
|
/* Encountered a duplicate in _bt_check_unique() */
|
|
|
|
Assert(insertstate->bounds_valid);
|
|
|
|
uniquedup = true;
|
|
|
|
}
|
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Does the new tuple belong on this page?
|
|
|
|
*
|
|
|
|
* The earlier _bt_check_unique() call may well have
|
|
|
|
* established a strict upper bound on the offset for the new
|
|
|
|
* item. If it's not the last item of the page (i.e. if there
|
|
|
|
* is at least one tuple on the page that goes after the tuple
|
|
|
|
* we're inserting) then we know that the tuple belongs on
|
|
|
|
* this page. We can skip the high key check.
|
|
|
|
*/
|
|
|
|
if (insertstate->bounds_valid &&
|
|
|
|
insertstate->low <= insertstate->stricthigh &&
|
|
|
|
insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Test '<=', not '!=', since scantid is set now */
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_RIGHTMOST(opaque) ||
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
|
|
|
|
break;
|
2006-07-25 21:13:00 +02:00
|
|
|
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_stepright(rel, heapRel, insertstate, stack);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/* Update local state after stepping right */
|
|
|
|
page = BufferGetPage(insertstate->buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a
GIN-style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* Assume duplicates (if checkingunique) */
|
|
|
|
uniquedup = true;
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2001-03-22 05:01:46 +01:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
2021-01-13 18:21:32 +01:00
|
|
|
* If the target page cannot fit newitem, try to avoid splitting the
|
|
|
|
* page on insert by performing deletion or deduplication now
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
*/
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (PageGetFreeSpace(page) < insertstate->itemsz)
|
2020-11-17 18:45:56 +01:00
|
|
|
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
|
2021-01-13 18:21:32 +01:00
|
|
|
checkingunique, uniquedup,
|
|
|
|
indexUnchanged);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*----------
|
|
|
|
* This is a !heapkeyspace (version 2 or 3) index. The current page
|
|
|
|
* is the first page that we could insert the new tuple to, but there
|
|
|
|
* may be other pages to the right that we could opt to use instead.
|
2019-03-20 17:30:57 +01:00
|
|
|
*
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* If the new key is equal to one or more existing keys, we can
|
|
|
|
* legitimately place it anywhere in the series of equal keys. In
|
|
|
|
* fact, if the new key is equal to the page's "high key" we can place
|
|
|
|
* it on the next page. If it is equal to the high key, and there's
|
|
|
|
* not room to insert the new tuple on the current page without
|
|
|
|
* splitting, then we move right hoping to find more free space and
|
|
|
|
* avoid a split.
|
|
|
|
*
|
|
|
|
* Keep scanning right until we
|
|
|
|
* (a) find a page with enough free space,
|
|
|
|
* (b) reach the last page where the tuple can legally go, or
|
|
|
|
* (c) get tired of searching.
|
|
|
|
* (c) is not flippant; it is important because if there are many
|
|
|
|
* pages' worth of equal keys, it's better to split one of the early
|
|
|
|
* pages than to scan all the way to the end of the run of equal keys
|
|
|
|
* on every insert. We implement "get tired" as a random choice,
|
|
|
|
* since stopping after scanning a fixed number of pages wouldn't work
|
|
|
|
* well (we'd never reach the right-hand side of previously split
|
|
|
|
* pages). The probability of moving right is set at 0.99, which may
|
|
|
|
* seem too high to change the behavior much, but it does an excellent
|
|
|
|
* job of preventing O(N^2) behavior with many equal keys.
|
|
|
|
*----------
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
while (PageGetFreeSpace(page) < insertstate->itemsz)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Before considering moving right, see if we can obtain enough
|
|
|
|
* space by erasing LP_DEAD items
|
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_HAS_GARBAGE(opaque))
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
{
|
2021-01-13 18:21:32 +01:00
|
|
|
/* Perform simple deletion */
|
2020-11-17 18:45:56 +01:00
|
|
|
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
|
2021-01-13 18:21:32 +01:00
|
|
|
false, false, false);
|
2019-03-20 17:30:57 +01:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
if (PageGetFreeSpace(page) >= insertstate->itemsz)
|
|
|
|
break; /* OK, now we have enough space */
|
|
|
|
}
|
2007-03-03 21:13:06 +01:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/*
|
|
|
|
* Nope, so check conditions (b) and (c) enumerated above
|
|
|
|
*
|
|
|
|
* The earlier _bt_check_unique() call may well have established a
|
|
|
|
* strict upper bound on the offset for the new item. If it's not
|
|
|
|
* the last item of the page (i.e. if there is at least one tuple
|
|
|
|
* on the page that's greater than the tuple we're inserting to)
|
|
|
|
* then we know that the tuple belongs on this page. We can skip
|
|
|
|
* the high key check.
|
|
|
|
*/
|
|
|
|
if (insertstate->bounds_valid &&
|
|
|
|
insertstate->low <= insertstate->stricthigh &&
|
|
|
|
insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
|
|
|
|
break;
|
|
|
|
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_RIGHTMOST(opaque) ||
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
|
Replace random(), pg_erand48(), etc with a better PRNG API and algorithm.
Standardize on xoroshiro128** as our basic PRNG algorithm, eliminating
a bunch of platform dependencies as well as fundamentally-obsolete PRNG
code. In addition, this API replacement will ease replacing the
algorithm again in future, should that become necessary.
xoroshiro128** is a few percent slower than the drand48 family,
but it can produce full-width 64-bit random values, not only 48-bit ones,
and it should be much more trustworthy. It's likely to be noticeably
faster than the platform's random(), depending on which platform you
are thinking about; and we can have non-global state vectors easily,
unlike with random(). It is not cryptographically strong, but neither
are the functions it replaces.
Fabien Coelho, reviewed by Dean Rasheed, Aleksander Alekseev, and myself
Discussion: https://postgr.es/m/alpine.DEB.2.22.394.2105241211230.165418@pseudo
2021-11-29 03:32:36 +01:00
|
|
|
pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
break;
|
|
|
|
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_stepright(rel, heapRel, insertstate, stack);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/* Update local state after stepping right */
|
|
|
|
page = BufferGetPage(insertstate->buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
}
|
2019-03-20 17:30:57 +01:00
|
|
|
}
|
2007-03-03 21:13:06 +01:00
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
/*
|
|
|
|
* We should now be on the correct page. Find the offset within the page
|
|
|
|
* for the new tuple. (Possibly reusing earlier search bounds.)
|
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(P_RIGHTMOST(opaque) ||
|
2019-03-20 17:30:57 +01:00
|
|
|
_bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case where you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
newitemoff = _bt_binsrch_insert(rel, insertstate);
|
|
|
|
|
|
|
|
if (insertstate->postingoff == -1)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* There is an overlapping posting list tuple with its LP_DEAD bit
|
|
|
|
* set. We don't want to unnecessarily unset its LP_DEAD bit while
|
2021-01-13 18:21:32 +01:00
|
|
|
* performing a posting list split, so perform simple index tuple
|
|
|
|
* deletion early.
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
*/
|
2020-11-17 18:45:56 +01:00
|
|
|
_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
|
2021-01-13 18:21:32 +01:00
|
|
|
false, false, false);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do new binary search. New insert location cannot overlap with any
|
|
|
|
* posting list now.
|
|
|
|
*/
|
2020-11-17 18:45:56 +01:00
|
|
|
Assert(!insertstate->bounds_valid);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
insertstate->postingoff = 0;
|
|
|
|
newitemoff = _bt_binsrch_insert(rel, insertstate);
|
|
|
|
Assert(insertstate->postingoff == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return newitemoff;
|
2019-03-20 17:30:57 +01:00
|
|
|
}
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
/*
|
|
|
|
* Step right to next non-dead page, during insertion.
|
|
|
|
*
|
|
|
|
* This is a bit more complicated than moving right in a search. We must
|
|
|
|
* write-lock the target page before releasing write lock on current page;
|
|
|
|
* else someone else's _bt_check_unique scan could fail to see our insertion.
|
|
|
|
* Write locks on intermediate dead pages won't do because we don't know when
|
|
|
|
* they will get de-linked from the tree.
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
*
|
|
|
|
* This is more aggressive than it needs to be for non-unique !heapkeyspace
|
|
|
|
* indexes.
|
2019-03-20 17:30:57 +01:00
|
|
|
*/
|
|
|
|
static void
|
2023-06-10 23:08:25 +02:00
|
|
|
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
|
|
|
|
BTStack stack)
|
2019-03-20 17:30:57 +01:00
|
|
|
{
|
|
|
|
Page page;
|
2020-11-17 18:01:14 +01:00
|
|
|
BTPageOpaque opaque;
|
2019-03-20 17:30:57 +01:00
|
|
|
Buffer rbuf;
|
|
|
|
BlockNumber rblkno;
|
|
|
|
|
2023-06-10 23:08:25 +02:00
|
|
|
Assert(heaprel != NULL);
|
2019-03-20 17:30:57 +01:00
|
|
|
page = BufferGetPage(insertstate->buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2019-03-20 17:30:57 +01:00
|
|
|
|
|
|
|
rbuf = InvalidBuffer;
|
2020-11-17 18:01:14 +01:00
|
|
|
rblkno = opaque->btpo_next;
|
2019-03-20 17:30:57 +01:00
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
|
|
|
|
page = BufferGetPage(rbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
/*
|
|
|
|
* If this page was incompletely split, finish the split now. We do
|
|
|
|
* this while holding a lock on the left sibling, which is not good
|
|
|
|
* because finishing the split could be a fairly lengthy operation.
|
|
|
|
* But this should happen very seldom.
|
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_INCOMPLETE_SPLIT(opaque))
|
2019-03-20 17:30:57 +01:00
|
|
|
{
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_finish_split(rel, heaprel, rbuf, stack);
|
2019-03-20 17:30:57 +01:00
|
|
|
rbuf = InvalidBuffer;
|
|
|
|
continue;
|
2007-03-03 21:13:06 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2020-11-17 18:01:14 +01:00
|
|
|
if (!P_IGNORE(opaque))
|
2019-03-20 17:30:57 +01:00
|
|
|
break;
|
2020-11-17 18:01:14 +01:00
|
|
|
if (P_RIGHTMOST(opaque))
|
2019-03-20 17:30:57 +01:00
|
|
|
elog(ERROR, "fell off the end of index \"%s\"",
|
|
|
|
RelationGetRelationName(rel));
|
2007-03-03 21:13:06 +01:00
|
|
|
|
2020-11-17 18:01:14 +01:00
|
|
|
rblkno = opaque->btpo_next;
|
2019-03-20 17:30:57 +01:00
|
|
|
}
|
|
|
|
/* rbuf locked; unlock buf, update state for caller */
|
|
|
|
_bt_relbuf(rel, insertstate->buf);
|
|
|
|
insertstate->buf = rbuf;
|
|
|
|
insertstate->bounds_valid = false;
|
2007-03-03 21:13:06 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*----------
|
|
|
|
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
|
|
|
|
*
|
|
|
|
* This recursive procedure does the following things:
|
|
|
|
*
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* + if postingoff != 0, splits existing posting list tuple
|
|
|
|
* (since it overlaps with new 'itup' tuple).
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* + if necessary, splits the target page, using 'itup_key' for
|
|
|
|
* suffix truncation on leaf pages (caller passes NULL for
|
|
|
|
* non-leaf pages).
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* + inserts the new tuple (might be split from posting list).
|
2007-03-03 21:13:06 +01:00
|
|
|
* + if the page was split, pops the parent stack, and finds the
|
|
|
|
* right place to insert the new child pointer (by walking
|
|
|
|
* right using information stored in the parent stack).
|
|
|
|
* + invokes itself with the appropriate tuple for the right
|
|
|
|
* child page on the parent.
|
|
|
|
* + updates the metapage if a true root or fast root is split.
|
|
|
|
*
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
* On entry, we must have the correct buffer in which to do the
|
2007-03-03 21:13:06 +01:00
|
|
|
* insertion, and the buffer must be pinned and write-locked. On return,
|
|
|
|
* we will have dropped both the pin and the lock on the buffer.
|
|
|
|
*
|
2018-04-19 10:08:45 +02:00
|
|
|
* This routine only performs retail tuple insertions. 'itup' should
|
|
|
|
* always be either a non-highkey leaf item, or a downlink (new high
|
|
|
|
* key items are created indirectly, when a page is split). When
|
|
|
|
* inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
|
|
|
|
* we're inserting the downlink for. This function will clear the
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
* INCOMPLETE_SPLIT flag on it, and release the buffer.
|
2007-03-03 21:13:06 +01:00
|
|
|
*----------
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
_bt_insertonpg(Relation rel,
|
2023-04-02 05:12:26 +02:00
|
|
|
Relation heaprel,
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
BTScanInsert itup_key,
|
2007-03-03 21:13:06 +01:00
|
|
|
Buffer buf,
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
Buffer cbuf,
|
2007-03-03 21:13:06 +01:00
|
|
|
BTStack stack,
|
|
|
|
IndexTuple itup,
|
2020-03-16 20:00:10 +01:00
|
|
|
Size itemsz,
|
2007-03-03 21:13:06 +01:00
|
|
|
OffsetNumber newitemoff,
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
int postingoff,
|
2007-03-03 21:13:06 +01:00
|
|
|
bool split_only_page)
|
|
|
|
{
|
|
|
|
Page page;
|
2020-11-17 18:01:14 +01:00
|
|
|
BTPageOpaque opaque;
|
|
|
|
bool isleaf,
|
|
|
|
isroot,
|
|
|
|
isrightmost,
|
|
|
|
isonly;
|
2020-02-27 00:15:45 +01:00
|
|
|
IndexTuple oposting = NULL;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
IndexTuple origitup = NULL;
|
|
|
|
IndexTuple nposting = NULL;
|
2007-03-03 21:13:06 +01:00
|
|
|
|
2016-04-20 15:31:19 +02:00
|
|
|
page = BufferGetPage(buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2020-11-17 18:01:14 +01:00
|
|
|
isleaf = P_ISLEAF(opaque);
|
|
|
|
isroot = P_ISROOT(opaque);
|
|
|
|
isrightmost = P_RIGHTMOST(opaque);
|
|
|
|
isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
|
2007-03-03 21:13:06 +01:00
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/* child buffer must be given iff inserting on an internal page */
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(isleaf == !BufferIsValid(cbuf));
|
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple
without non-key attributes where that's expected. Also, remove the
assertion within _bt_isequal(), restoring the v10 function signature. A
similar check will be performed for the page highkey within
_bt_moveright() in most cases. Also avoid dropping all objects within
regression tests, to increase pg_dump test coverage for INCLUDE indexes.
Rather than using infrastructure that's generally intended to be used
with reference counted heap tuple descriptors during truncation, use the
same function that was introduced to store flat TupleDescs in shared
memory (we use a temp palloc'd buffer). This isn't strictly necessary,
but seems more future-proof than the old approach. It also lets us
avoid including rel.h within indextuple.c, which was arguably a
modularity violation. Also, we now call index_deform_tuple() with the
truncated TupleDesc, not the source TupleDesc, since that's more robust,
and saves a few cycles.
In passing, fix a memory leak by pfree'ing truncated pivot tuple memory
during CREATE INDEX. Also pfree during a page split, just to be
consistent.
Refactor _bt_check_natts() to be more readable.
Author: Peter Geoghegan with some editorization by me
Reviewed by: Alexander Korotkov, Teodor Sigaev
Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
2018-04-19 07:45:58 +02:00
|
|
|
/* tuple must have appropriate number of attributes */
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(!isleaf ||
|
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple
without non-key attributes where that's expected. Also, remove the
assertion within _bt_isequal(), restoring the v10 function signature. A
similar check will be performed for the page highkey within
_bt_moveright() in most cases. Also avoid dropping all objects within
regression tests, to increase pg_dump test coverage for INCLUDE indexes.
Rather than using infrastructure that's generally intended to be used
with reference counted heap tuple descriptors during truncation, use the
same function that was introduced to store flat TupleDescs in shared
memory (we use a temp palloc'd buffer). This isn't strictly necessary,
but seems more future-proof than the old approach. It also lets us
avoid including rel.h within indextuple.c, which was arguably a
modularity violation. Also, we now call index_deform_tuple() with the
truncated TupleDesc, not the source TupleDesc, since that's more robust,
and saves a few cycles.
In passing, fix a memory leak by pfree'ing truncated pivot tuple memory
during CREATE INDEX. Also pfree during a page split, just to be
consistent.
Refactor _bt_check_natts() to be more readable.
Author: Peter Geoghegan with some editorization by me
Reviewed by: Alexander Korotkov, Teodor Sigaev
Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
2018-04-19 07:45:58 +02:00
|
|
|
BTreeTupleGetNAtts(itup, rel) ==
|
|
|
|
IndexRelationGetNumberOfAttributes(rel));
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(isleaf ||
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
BTreeTupleGetNAtts(itup, rel) <=
|
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple
without non-key attributes where that's expected. Also, remove the
assertion within _bt_isequal(), restoring the v10 function signature. A
similar check will be performed for the page highkey within
_bt_moveright() in most cases. Also avoid dropping all objects within
regression tests, to increase pg_dump test coverage for INCLUDE indexes.
Rather than using infrastructure that's generally intended to be used
with reference counted heap tuple descriptors during truncation, use the
same function that was introduced to store flat TupleDescs in shared
memory (we use a temp palloc'd buffer). This isn't strictly necessary,
but seems more future-proof than the old approach. It also lets us
avoid including rel.h within indextuple.c, which was arguably a
modularity violation. Also, we now call index_deform_tuple() with the
truncated TupleDesc, not the source TupleDesc, since that's more robust,
and saves a few cycles.
In passing, fix a memory leak by pfree'ing truncated pivot tuple memory
during CREATE INDEX. Also pfree during a page split, just to be
consistent.
Refactor _bt_check_natts() to be more readable.
Author: Peter Geoghegan with some editorization by me
Reviewed by: Alexander Korotkov, Teodor Sigaev
Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
2018-04-19 07:45:58 +02:00
|
|
|
IndexRelationGetNumberOfKeyAttributes(rel));
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
Assert(!BTreeTupleIsPosting(itup));
|
2020-03-16 20:00:10 +01:00
|
|
|
Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
|
2020-11-15 20:53:37 +01:00
|
|
|
/* Caller must always finish incomplete split for us */
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(!P_INCOMPLETE_SPLIT(opaque));
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
2020-03-10 22:15:41 +01:00
|
|
|
/*
|
|
|
|
* Every internal page should have exactly one negative infinity item at
|
2023-06-10 23:08:25 +02:00
|
|
|
* all times. Only _bt_split() and _bt_newlevel() should add items that
|
2020-03-10 22:15:41 +01:00
|
|
|
* become negative infinity items through truncation, since they're the
|
|
|
|
* only routines that allocate new internal pages.
|
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
|
2020-03-10 22:15:41 +01:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/*
|
|
|
|
* Do we need to split an existing posting list item?
|
|
|
|
*/
|
|
|
|
if (postingoff != 0)
|
|
|
|
{
|
|
|
|
ItemId itemid = PageGetItemId(page, newitemoff);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The new tuple is a duplicate with a heap TID that falls inside the
|
|
|
|
* range of an existing posting list tuple on a leaf page. Prepare to
|
|
|
|
* split an existing posting list. Overwriting the posting list with
|
|
|
|
* its post-split version is treated as an extra step in either the
|
|
|
|
* insert or page split critical section.
|
|
|
|
*/
|
2021-10-27 21:10:47 +02:00
|
|
|
Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
oposting = (IndexTuple) PageGetItem(page, itemid);
|
|
|
|
|
2021-10-27 21:10:47 +02:00
|
|
|
/*
|
|
|
|
* postingoff value comes from earlier call to _bt_binsrch_posting().
|
|
|
|
* Its binary search might think that a plain tuple must be a posting
|
|
|
|
* list tuple that needs to be split. This can happen with corruption
|
|
|
|
* involving an existing plain tuple that is a duplicate of the new
|
|
|
|
* item, up to and including its table TID. Check for that here in
|
|
|
|
* passing.
|
|
|
|
*
|
|
|
|
* Also verify that our caller has made sure that the existing posting
|
|
|
|
* list tuple does not have its LP_DEAD bit set.
|
|
|
|
*/
|
|
|
|
if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
|
|
|
errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
|
|
|
|
ItemPointerGetBlockNumber(&itup->t_tid),
|
|
|
|
ItemPointerGetOffsetNumber(&itup->t_tid),
|
2021-10-27 22:05:35 +02:00
|
|
|
newitemoff, BufferGetBlockNumber(buf),
|
2021-10-27 21:10:47 +02:00
|
|
|
RelationGetRelationName(rel))));
|
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* use a mutable copy of itup as our itup from here on */
|
|
|
|
origitup = itup;
|
|
|
|
itup = CopyIndexTuple(origitup);
|
|
|
|
nposting = _bt_swap_posting(itup, oposting, postingoff);
|
|
|
|
/* itup now contains rightmost/max TID from oposting */
|
|
|
|
|
|
|
|
/* Alter offset so that newitem goes after posting list */
|
|
|
|
newitemoff = OffsetNumberNext(newitemoff);
|
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Do we need to split the page to fit the item on it?
|
2000-07-21 21:21:00 +02:00
|
|
|
*
|
|
|
|
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
|
|
|
|
* so this comparison is correct even though we appear to be accounting
|
|
|
|
* only for the item and not for its line pointer.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (PageGetFreeSpace(page) < itemsz)
|
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
Buffer rbuf;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2020-04-14 06:11:03 +02:00
|
|
|
Assert(!split_only_page);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* split the buffer into left and right halves */
|
2023-04-02 05:12:26 +02:00
|
|
|
rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
|
|
|
|
itup, origitup, nposting, postingoff);
|
Implement genuine serializable isolation level.
Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, ie. it sometimes aborts transactions even
though there is no anomaly.
To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
A predicate lock doesn't conflict with any regular locks or with other
predicate locks in the normal sense. They're only used by the predicate lock
manager to detect the danger of anomalies. Only serializable transactions
participate in predicate locking, so there should be no extra overhead
for other transactions.
Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with it have completed. That means that
we need to remember an unbounded amount of predicate locks, so we apply a
lossy but conservative method of tracking locks for committed transactions.
If we run short of shared memory, we overflow to a new "pg_serial" SLRU
pool.
We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.
Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
|
|
|
PredicateLockPageSplit(rel,
|
|
|
|
BufferGetBlockNumber(buf),
|
|
|
|
BufferGetBlockNumber(rbuf));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*----------
|
1996-07-09 08:22:35 +02:00
|
|
|
* By here,
|
1997-09-07 07:04:48 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* + our target page has been split;
|
|
|
|
* + the original tuple has been inserted;
|
|
|
|
* + we have write locks on both the old (left half)
|
|
|
|
* and new (right half) buffers, after the split; and
|
|
|
|
* + we know the key we want to insert into the parent
|
|
|
|
* (it's the "high key" on the left child page).
|
|
|
|
*
|
|
|
|
* We're ready to do the parent insertion. We need to hold onto the
|
|
|
|
* locks for the child pages until we locate the parent, but we can
|
2019-03-13 00:40:05 +01:00
|
|
|
* at least release the lock on the right child before doing the
|
|
|
|
* actual insertion. The lock on the left child will be released
|
|
|
|
* last of all by parent insertion, where it is the 'cbuf' of parent
|
|
|
|
* page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*----------
|
1997-06-10 09:28:50 +02:00
|
|
|
*/
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Buffer metabuf = InvalidBuffer;
|
|
|
|
Page metapg = NULL;
|
|
|
|
BTMetaPageData *metad = NULL;
|
2020-03-18 22:42:49 +01:00
|
|
|
BlockNumber blockcache;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
|
|
|
* If we are doing this insert because we split a page that was the
|
|
|
|
* only one on its tree level, but was not the root, it may have been
|
|
|
|
* the "fast root". We need to ensure that the fast root link points
|
|
|
|
* at or above the current page. We can safely acquire a lock on the
|
2023-06-10 23:08:25 +02:00
|
|
|
* metapage here --- see comments for _bt_newlevel().
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
if (unlikely(split_only_page))
|
1996-12-06 10:45:30 +01:00
|
|
|
{
|
2020-03-18 22:42:49 +01:00
|
|
|
Assert(!isleaf);
|
|
|
|
Assert(BufferIsValid(cbuf));
|
2006-04-13 05:53:05 +02:00
|
|
|
|
2023-06-10 23:08:25 +02:00
|
|
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
metapg = BufferGetPage(metabuf);
|
2003-02-21 01:06:22 +01:00
|
|
|
metad = BTPageGetMeta(metapg);
|
2001-01-29 08:28:17 +01:00
|
|
|
|
2021-02-25 03:41:34 +01:00
|
|
|
if (metad->btm_fastlevel >= opaque->btpo_level)
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
|
|
|
/* no update wanted */
|
|
|
|
_bt_relbuf(rel, metabuf);
|
|
|
|
metabuf = InvalidBuffer;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-07-21 22:29:40 +02:00
|
|
|
/* Do the update. No ereport(ERROR) until changes are logged */
|
2003-02-21 01:06:22 +01:00
|
|
|
START_CRIT_SECTION();
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (postingoff != 0)
|
|
|
|
memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
|
|
|
|
|
2020-03-19 02:17:37 +01:00
|
|
|
if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
|
|
|
|
false) == InvalidOffsetNumber)
|
2010-08-29 21:33:14 +02:00
|
|
|
elog(PANIC, "failed to add new item to block %u in index \"%s\"",
|
2020-03-18 02:39:26 +01:00
|
|
|
BufferGetBlockNumber(buf), RelationGetRelationName(rel));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
MarkBufferDirty(buf);
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
if (BufferIsValid(metabuf))
|
|
|
|
{
|
Skip full index scan during cleanup of B-tree indexes when possible
Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, then autovacuum isn't intended to touch this table. However,
the user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such a table (because no heap tuples were deleted), only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: put recyclable pages into the free space map and update index
statistics.
This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both reloption and GUC (default).
This patch bumps B-tree meta-page version. Upgrade of meta-page is performed
"on the fly": during VACUUM meta-page is rewritten with new version. No special
handling in pg_upgrade is required.
Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
2018-04-04 18:29:00 +02:00
|
|
|
/* upgrade meta-page if needed */
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
if (metad->btm_version < BTREE_NOVAC_VERSION)
|
Skip full index scan during cleanup of B-tree indexes when possible
Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, then autovacuum isn't intended to touch this table. However,
the user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such a table (because no heap tuples were deleted), only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: put recyclable pages into the free space map and update index
statistics.
This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both reloption and GUC (default).
This patch bumps B-tree meta-page version. Upgrade of meta-page is performed
"on the fly": during VACUUM meta-page is rewritten with new version. No special
handling in pg_upgrade is required.
Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
2018-04-04 18:29:00 +02:00
|
|
|
_bt_upgrademetapage(metapg);
|
2020-03-18 02:39:26 +01:00
|
|
|
metad->btm_fastroot = BufferGetBlockNumber(buf);
|
2021-02-25 03:41:34 +01:00
|
|
|
metad->btm_fastlevel = opaque->btpo_level;
|
2006-04-01 01:32:07 +02:00
|
|
|
MarkBufferDirty(metabuf);
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2020-04-14 04:26:41 +02:00
|
|
|
/*
|
|
|
|
* Clear INCOMPLETE_SPLIT flag on child if inserting the new item
|
|
|
|
* finishes a split
|
|
|
|
*/
|
|
|
|
if (!isleaf)
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
{
|
2016-04-20 15:31:19 +02:00
|
|
|
Page cpage = BufferGetPage(cbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
BTPageOpaque cpageop = BTPageGetOpaque(cpage);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
Assert(P_INCOMPLETE_SPLIT(cpageop));
|
|
|
|
cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
|
|
|
|
MarkBufferDirty(cbuf);
|
|
|
|
}
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/* XLOG stuff */
|
2010-12-13 18:34:26 +01:00
|
|
|
if (RelationNeedsWAL(rel))
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
|
|
|
xl_btree_insert xlrec;
|
|
|
|
xl_btree_metadata xlmeta;
|
|
|
|
uint8 xlinfo;
|
|
|
|
XLogRecPtr recptr;
|
2020-04-30 21:31:56 +02:00
|
|
|
uint16 upostingoff;
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2020-03-18 02:39:26 +01:00
|
|
|
xlrec.offnum = newitemoff;
|
2003-02-21 01:06:22 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed-size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2020-03-18 22:42:49 +01:00
|
|
|
if (isleaf && postingoff == 0)
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
{
|
|
|
|
/* Simple leaf insert */
|
2006-04-13 05:53:05 +02:00
|
|
|
xlinfo = XLOG_BTREE_INSERT_LEAF;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
}
|
|
|
|
else if (postingoff != 0)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Leaf insert with posting list split. Must include
|
|
|
|
* postingoff field before newitem/orignewitem.
|
|
|
|
*/
|
2020-04-14 18:33:18 +02:00
|
|
|
Assert(isleaf);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
xlinfo = XLOG_BTREE_INSERT_POST;
|
|
|
|
}
|
2006-04-13 05:53:05 +02:00
|
|
|
else
|
|
|
|
{
|
2020-04-14 04:26:41 +02:00
|
|
|
/* Internal page insert, which finishes a split on cbuf */
|
2006-04-13 05:53:05 +02:00
|
|
|
xlinfo = XLOG_BTREE_INSERT_UPPER;
|
2020-04-14 04:26:41 +02:00
|
|
|
XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
|
2006-04-13 05:53:05 +02:00
|
|
|
|
2020-04-14 18:33:18 +02:00
|
|
|
if (BufferIsValid(metabuf))
|
|
|
|
{
|
|
|
|
/* Actually, it's an internal page insert + meta update */
|
|
|
|
xlinfo = XLOG_BTREE_INSERT_META;
|
|
|
|
|
|
|
|
Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
|
|
|
|
xlmeta.version = metad->btm_version;
|
|
|
|
xlmeta.root = metad->btm_root;
|
|
|
|
xlmeta.level = metad->btm_level;
|
|
|
|
xlmeta.fastroot = metad->btm_fastroot;
|
|
|
|
xlmeta.fastlevel = metad->btm_fastlevel;
|
2021-02-25 03:41:34 +01:00
|
|
|
xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
|
2020-04-14 18:33:18 +02:00
|
|
|
xlmeta.allequalimage = metad->btm_allequalimage;
|
|
|
|
|
|
|
|
XLogRegisterBuffer(2, metabuf,
|
|
|
|
REGBUF_WILL_INIT | REGBUF_STANDARD);
|
|
|
|
XLogRegisterBufData(2, (char *) &xlmeta,
|
|
|
|
sizeof(xl_btree_metadata));
|
|
|
|
}
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
2001-01-31 02:08:36 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (postingoff == 0)
|
|
|
|
{
|
2020-04-14 18:33:18 +02:00
|
|
|
/* Just log itup from caller */
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Insert with posting list split (XLOG_BTREE_INSERT_POST
|
|
|
|
* record) case.
|
|
|
|
*
|
|
|
|
* Log postingoff. Also log origitup, not itup. REDO routine
|
|
|
|
* must reconstruct final itup (as well as nposting) using
|
|
|
|
* _bt_swap_posting().
|
|
|
|
*/
|
2020-04-30 21:31:56 +02:00
|
|
|
upostingoff = postingoff;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
|
|
|
|
XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
|
|
|
|
XLogRegisterBufData(0, (char *) origitup,
|
|
|
|
IndexTupleSize(origitup));
|
|
|
|
}
|
2003-02-21 01:06:22 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
if (BufferIsValid(metabuf))
|
|
|
|
PageSetLSN(metapg, recptr);
|
2020-04-14 04:26:41 +02:00
|
|
|
if (!isleaf)
|
2016-04-20 15:31:19 +02:00
|
|
|
PageSetLSN(BufferGetPage(cbuf), recptr);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
PageSetLSN(page, recptr);
|
1996-12-06 10:45:30 +01:00
|
|
|
}
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
END_CRIT_SECTION();
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2020-03-18 22:42:49 +01:00
|
|
|
/* Release subsidiary buffers */
|
2003-02-21 01:06:22 +01:00
|
|
|
if (BufferIsValid(metabuf))
|
2006-04-01 01:32:07 +02:00
|
|
|
_bt_relbuf(rel, metabuf);
|
2020-04-14 04:26:41 +02:00
|
|
|
if (!isleaf)
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
_bt_relbuf(rel, cbuf);
|
2020-03-18 22:42:49 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Cache the block number if this is the rightmost leaf page. Cache
|
|
|
|
* may be used by a future inserter within _bt_search_insert().
|
|
|
|
*/
|
|
|
|
blockcache = InvalidBlockNumber;
|
2020-11-17 18:01:14 +01:00
|
|
|
if (isrightmost && isleaf && !isroot)
|
2020-03-18 22:42:49 +01:00
|
|
|
blockcache = BufferGetBlockNumber(buf);
|
|
|
|
|
|
|
|
/* Release buffer for insertion target block */
|
2006-04-01 01:32:07 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2018-04-11 00:21:03 +02:00
|
|
|
|
|
|
|
/*
|
2020-03-18 22:42:49 +01:00
|
|
|
* If we decided to cache the insertion target block before releasing
|
|
|
|
* its buffer lock, then cache it now. Check the height of the tree
|
|
|
|
* first, though. We don't go for the optimization with small
|
|
|
|
* indexes. Defer final check to this point to ensure that we don't
|
|
|
|
* call _bt_getrootheight while holding a buffer lock.
|
2018-04-11 00:21:03 +02:00
|
|
|
*/
|
2020-03-18 22:42:49 +01:00
|
|
|
if (BlockNumberIsValid(blockcache) &&
|
2023-06-10 23:08:25 +02:00
|
|
|
_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
|
2020-03-18 22:42:49 +01:00
|
|
|
RelationSetTargetBlock(rel, blockcache);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
|
|
|
|
/* be tidy */
|
|
|
|
if (postingoff != 0)
|
|
|
|
{
|
|
|
|
/* itup is actually a modified copy of caller's original */
|
|
|
|
pfree(nposting);
|
|
|
|
pfree(itup);
|
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_split() -- split a page in the btree.
|
|
|
|
*
|
2006-04-13 05:53:05 +02:00
|
|
|
* On entry, buf is the page to split, and is pinned and write-locked.
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* newitemoff etc. tell us about the new item that must be inserted
|
|
|
|
* along with the data from the original page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is a prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
* itup_key is used for suffix truncation on leaf pages (internal
|
|
|
|
* page callers pass NULL). When splitting a non-leaf page, 'cbuf'
|
|
|
|
* is the left-sibling of the page we're inserting the downlink for.
|
|
|
|
* This function will clear the INCOMPLETE_SPLIT flag on it, and
|
|
|
|
* release the buffer.
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page into the parent. Previously,
a crash in between those steps was handled by a cleanup routine that ran
after WAL recovery had finished and completed the incomplete split.
However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that WAL recovery is never performed. That could
happen, for example, if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted into the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
*
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* orignewitem, nposting, and postingoff are needed when an insert of
|
|
|
|
* orignewitem results in both a posting list split and a page split.
|
|
|
|
* These extra posting list split details are used here in the same
|
|
|
|
* way as they are used in the more common case where a posting list
|
|
|
|
* split does not coincide with a page split. We need to deal with
|
|
|
|
* posting list splits directly in order to ensure that everything
|
|
|
|
* that follows from the insert of orignewitem is handled as a single
|
|
|
|
* atomic operation (though caller's insert of a new pivot/downlink
|
|
|
|
* into parent page will still be a separate operation). See
|
|
|
|
* nbtree/README for details on the design of posting list splits.
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Returns the new right sibling of buf, pinned and write-locked.
|
2005-09-25 00:54:44 +02:00
|
|
|
* The pin and lock on buf are maintained.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
static Buffer
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
|
|
|
|
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
|
|
|
Buffer rbuf;
|
|
|
|
Page origpage;
|
|
|
|
Page leftpage,
|
|
|
|
rightpage;
|
2010-08-29 21:33:14 +02:00
|
|
|
BlockNumber origpagenumber,
|
|
|
|
rightpagenumber;
|
1996-07-09 08:22:35 +02:00
|
|
|
BTPageOpaque ropaque,
|
|
|
|
lopaque,
|
|
|
|
oopaque;
|
2003-02-22 01:45:05 +01:00
|
|
|
Buffer sbuf = InvalidBuffer;
|
|
|
|
Page spage = NULL;
|
|
|
|
BTPageOpaque sopaque = NULL;
|
1996-07-09 08:22:35 +02:00
|
|
|
Size itemsz;
|
|
|
|
ItemId itemid;
|
2020-04-14 01:39:55 +02:00
|
|
|
IndexTuple firstright,
|
|
|
|
lefthighkey;
|
|
|
|
OffsetNumber firstrightoff;
|
|
|
|
OffsetNumber afterleftoff,
|
|
|
|
afterrightoff,
|
|
|
|
minusinfoff;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
OffsetNumber origpagepostingoff;
|
1996-07-09 08:22:35 +02:00
|
|
|
OffsetNumber maxoff;
|
|
|
|
OffsetNumber i;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
bool newitemonleft,
|
2020-04-14 01:39:55 +02:00
|
|
|
isleaf,
|
|
|
|
isrightmost;
|
2000-10-13 14:05:22 +02:00
|
|
|
|
2010-08-29 21:33:14 +02:00
|
|
|
/*
|
|
|
|
* origpage is the original page to be split. leftpage is a temporary
|
|
|
|
* buffer that receives the left-sibling data, which will be copied back
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* into origpage on success. rightpage is the new page that will receive
|
|
|
|
* the right-sibling data.
|
|
|
|
*
|
|
|
|
* leftpage is allocated after choosing a split point. rightpage's new
|
|
|
|
* buffer isn't acquired until after leftpage is initialized and has new
|
|
|
|
* high key, the last point where splitting the page may fail (barring
|
|
|
|
* corruption). Failing before acquiring new buffer won't have lasting
|
|
|
|
* consequences, since origpage won't have been modified and leftpage is
|
|
|
|
* only workspace.
|
2010-08-29 21:33:14 +02:00
|
|
|
*/
|
2016-04-20 15:31:19 +02:00
|
|
|
origpage = BufferGetPage(buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
oopaque = BTPageGetOpaque(origpage);
|
2020-04-14 01:39:55 +02:00
|
|
|
isleaf = P_ISLEAF(oopaque);
|
|
|
|
isrightmost = P_RIGHTMOST(oopaque);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(origpage);
|
2010-08-29 21:33:14 +02:00
|
|
|
origpagenumber = BufferGetBlockNumber(buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2007-04-11 22:47:38 +02:00
|
|
|
/*
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* Choose a point to split origpage at.
|
|
|
|
*
|
2020-04-14 01:39:55 +02:00
|
|
|
* A split point can be thought of as a point _between_ two existing data
|
|
|
|
* items on origpage (the lastleft and firstright tuples), provided you
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* pretend that the new item that didn't fit is already on origpage.
|
|
|
|
*
|
|
|
|
* Since origpage does not actually contain newitem, the representation of
|
|
|
|
* split points needs to work with two boundary cases: splits where
|
|
|
|
* newitem is lastleft, and splits where newitem is firstright.
|
|
|
|
* newitemonleft resolves the ambiguity that would otherwise exist when
|
2020-04-14 01:39:55 +02:00
|
|
|
* newitemoff == firstrightoff. In all other cases it's clear which side
|
|
|
|
* of the split every tuple goes on from context. newitemonleft is
|
|
|
|
* usually (but not always) redundant information.
|
|
|
|
*
|
|
|
|
* firstrightoff is supposed to be an origpage offset number, but it's
|
|
|
|
* possible that its value will be maxoff+1, which is "past the end" of
|
|
|
|
* origpage. This happens in the rare case where newitem goes after all
|
|
|
|
* existing items (i.e. newitemoff is maxoff+1) and we end up splitting
|
|
|
|
* origpage at the point that leaves newitem alone on new right page. Any
|
|
|
|
* "!newitemonleft && newitemoff == firstrightoff" split point makes
|
|
|
|
* newitem the firstright tuple, though, so this case isn't a special
|
|
|
|
* case.
|
2007-04-11 22:47:38 +02:00
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
|
|
|
|
newitem, &newitemonleft);
|
2007-04-11 22:47:38 +02:00
|
|
|
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/* Allocate temp buffer for leftpage */
|
|
|
|
leftpage = PageGetTempPage(origpage);
|
|
|
|
_bt_pageinit(leftpage, BufferGetPageSize(buf));
|
2022-04-01 06:24:50 +02:00
|
|
|
lopaque = BTPageGetOpaque(leftpage);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/*
|
|
|
|
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
|
|
|
|
* and HAS_GARBAGE flags.
|
|
|
|
*/
|
2000-10-04 02:04:43 +02:00
|
|
|
lopaque->btpo_flags = oopaque->btpo_flags;
|
2006-07-25 21:13:00 +02:00
|
|
|
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/* set flag in leftpage indicating that rightpage has no downlink yet */
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
|
1996-07-09 08:22:35 +02:00
|
|
|
lopaque->btpo_prev = oopaque->btpo_prev;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/* handle btpo_next after rightpage buffer acquired */
|
2021-02-25 03:41:34 +01:00
|
|
|
lopaque->btpo_level = oopaque->btpo_level;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/* handle btpo_cycleid after rightpage buffer acquired */
|
1999-03-28 22:32:42 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* Copy the original page's LSN into leftpage, which will become the
|
|
|
|
* updated version of the page. We need this because XLogInsert will
|
|
|
|
* examine the LSN and possibly dump it in a page image.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
PageSetLSN(leftpage, PageGetLSN(origpage));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/*
|
|
|
|
* Determine page offset number of existing overlapped-with-orignewitem
|
|
|
|
* posting list when it is necessary to perform a posting list split in
|
|
|
|
* passing. Note that newitem was already changed by caller (newitem no
|
|
|
|
* longer has the orignewitem TID).
|
|
|
|
*
|
|
|
|
* This page offset number (origpagepostingoff) will be used to pretend
|
|
|
|
* that the posting split has already taken place, even though the
|
|
|
|
* required modifications to origpage won't occur until we reach the
|
|
|
|
* critical section. The lastleft and firstright tuples of our page split
|
|
|
|
* point should, in effect, come from an imaginary version of origpage
|
|
|
|
* that has the nposting tuple instead of the original posting list tuple.
|
|
|
|
*
|
|
|
|
* Note: _bt_findsplitloc() should have compensated for coinciding posting
|
|
|
|
* list splits in just the same way, at least in theory. It doesn't
|
|
|
|
* bother with that, though. In practice it won't affect its choice of
|
|
|
|
* split point.
|
|
|
|
*/
|
|
|
|
origpagepostingoff = InvalidOffsetNumber;
|
|
|
|
if (postingoff != 0)
|
|
|
|
{
|
|
|
|
Assert(isleaf);
|
|
|
|
Assert(ItemPointerCompare(&orignewitem->t_tid,
|
|
|
|
&newitem->t_tid) < 0);
|
|
|
|
Assert(BTreeTupleIsPosting(nposting));
|
|
|
|
origpagepostingoff = OffsetNumberPrev(newitemoff);
|
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2020-04-14 01:39:55 +02:00
|
|
|
* The high key for the new left page is a possibly-truncated copy of
|
|
|
|
* firstright on the leaf level (it's "firstright itself" on internal
|
|
|
|
* pages; see !isleaf comments below). This may seem to be contrary to
|
|
|
|
* Lehman & Yao's approach of using a copy of lastleft as the new high key
|
|
|
|
* when splitting on the leaf level. It isn't, though.
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
*
|
2020-04-14 01:39:55 +02:00
|
|
|
* Suffix truncation will leave the left page's high key fully equal to
|
|
|
|
* lastleft when lastleft and firstright are equal prior to heap TID (that
|
|
|
|
* is, the tiebreaker TID value comes from lastleft). It isn't actually
|
|
|
|
* necessary for a new leaf high key to be a copy of lastleft for the L&Y
|
|
|
|
* "subtree" invariant to hold. It's sufficient to make sure that the new
|
|
|
|
* leaf high key is strictly less than firstright, and greater than or
|
|
|
|
* equal to (not necessarily equal to) lastleft. In other words, when
|
|
|
|
* suffix truncation isn't possible during a leaf page split, we take
|
|
|
|
* L&Y's exact approach to generating a new high key for the left page.
|
|
|
|
* (Actually, that is slightly inaccurate. We don't just use a copy of
|
|
|
|
* lastleft. A tuple with all the keys from firstright but the max heap
|
|
|
|
* TID from lastleft is used, to avoid introducing a special case.)
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!newitemonleft && newitemoff == firstrightoff)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
/* incoming tuple becomes firstright */
|
2000-07-21 08:42:39 +02:00
|
|
|
itemsz = newitemsz;
|
2020-04-14 01:39:55 +02:00
|
|
|
firstright = newitem;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
/* existing item at firstrightoff becomes firstright */
|
|
|
|
itemid = PageGetItemId(origpage, firstrightoff);
|
2000-07-21 08:42:39 +02:00
|
|
|
itemsz = ItemIdGetLength(itemid);
|
2020-04-14 01:39:55 +02:00
|
|
|
firstright = (IndexTuple) PageGetItem(origpage, itemid);
|
|
|
|
if (firstrightoff == origpagepostingoff)
|
|
|
|
firstright = nposting;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2018-04-07 22:00:39 +02:00
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
if (isleaf)
|
2018-04-07 22:00:39 +02:00
|
|
|
{
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
IndexTuple lastleft;
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
/* Attempt suffix truncation for leaf page splits */
|
|
|
|
if (newitemonleft && newitemoff == firstrightoff)
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
/* incoming tuple becomes lastleft */
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
lastleft = newitem;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
OffsetNumber lastleftoff;
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
/* existing item before firstrightoff becomes lastleft */
|
|
|
|
lastleftoff = OffsetNumberPrev(firstrightoff);
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
|
|
|
|
itemid = PageGetItemId(origpage, lastleftoff);
|
|
|
|
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (lastleftoff == origpagepostingoff)
|
|
|
|
lastleft = nposting;
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
}
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
|
|
|
|
itemsz = IndexTupleSize(lefthighkey);
|
2018-04-07 22:00:39 +02:00
|
|
|
}
|
|
|
|
else
|
2020-04-14 01:39:55 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Don't perform suffix truncation on a copy of firstright to make
|
|
|
|
* left page high key for internal page splits. Must use firstright
|
|
|
|
* as new high key directly.
|
|
|
|
*
|
|
|
|
* Each distinct separator key value originates as a leaf level high
|
|
|
|
* key; all other separator keys/pivot tuples are copied from one
|
|
|
|
* level down. A separator key in a grandparent page must be
|
|
|
|
* identical to high key in rightmost parent page of the subtree to
|
|
|
|
* its left, which must itself be identical to high key in rightmost
|
|
|
|
* child page of that same subtree (this even applies to separator
|
|
|
|
* from grandparent's high key). There must always be an unbroken
|
|
|
|
* "seam" of identical separator keys that guide index scans at every
|
|
|
|
* level, starting from the grandparent. That's why suffix truncation
|
|
|
|
* is unsafe here.
|
|
|
|
*
|
|
|
|
* Internal page splits will truncate firstright into a "negative
|
|
|
|
* infinity" data item when it gets inserted on the new right page
|
|
|
|
* below, though. This happens during the call to _bt_pgaddtup() for
|
|
|
|
* the new first data item for right page. Do not confuse this
|
|
|
|
* mechanism with suffix truncation. It is just a convenient way of
|
|
|
|
* implementing page splits that split the internal page "inside"
|
|
|
|
* firstright. The lefthighkey separator key cannot appear a second
|
|
|
|
* time in the right page (only firstright's downlink goes in right
|
|
|
|
* page).
|
|
|
|
*/
|
|
|
|
lefthighkey = firstright;
|
|
|
|
}
|
2018-04-07 22:00:39 +02:00
|
|
|
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/*
|
|
|
|
* Add new high key to leftpage
|
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
afterleftoff = P_HIKEY;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
|
|
|
|
Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
|
|
|
|
IndexRelationGetNumberOfKeyAttributes(rel));
|
|
|
|
Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
|
|
|
|
if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
|
|
|
|
false) == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add high key to the left sibling"
|
2007-12-31 05:52:05 +01:00
|
|
|
" while splitting block %u of index \"%s\"",
|
2010-08-29 21:33:14 +02:00
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
2020-04-14 01:39:55 +02:00
|
|
|
afterleftoff = OffsetNumberNext(afterleftoff);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
* Acquire a new right page to split into, now that left page has a new
|
|
|
|
* high key. From here on, it's not okay to throw an error without
|
|
|
|
* zeroing rightpage first. This coding rule ensures that we won't
|
|
|
|
* confuse future VACUUM operations, which might otherwise try to re-find
|
|
|
|
* a downlink to a leftover junk page as the page undergoes deletion.
|
|
|
|
*
|
|
|
|
* It would be reasonable to start the critical section just after the new
|
|
|
|
* rightpage buffer is acquired instead; that would allow us to avoid
|
|
|
|
* leftover junk pages without bothering to zero rightpage. We do it this
|
|
|
|
* way because it avoids an unnecessary PANIC when either origpage or its
|
|
|
|
* existing sibling page is corrupt.
|
|
|
|
*/
|
2023-06-10 23:08:25 +02:00
|
|
|
rbuf = _bt_allocbuf(rel, heaprel);
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
rightpage = BufferGetPage(rbuf);
|
|
|
|
rightpagenumber = BufferGetBlockNumber(rbuf);
|
|
|
|
/* rightpage was initialized by _bt_allocbuf */
|
2022-04-01 06:24:50 +02:00
|
|
|
ropaque = BTPageGetOpaque(rightpage);
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Finish off remaining leftpage special area fields. They cannot be set
|
|
|
|
* before both origpage (leftpage) and rightpage buffers are acquired and
|
|
|
|
* locked.
|
2020-05-02 23:04:33 +02:00
|
|
|
*
|
|
|
|
* btpo_cycleid is only used with leaf pages, though we set it here in all
|
|
|
|
* cases just to be consistent.
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
*/
|
|
|
|
lopaque->btpo_next = rightpagenumber;
|
|
|
|
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
|
|
|
|
* and HAS_GARBAGE flags.
|
|
|
|
*/
|
|
|
|
ropaque->btpo_flags = oopaque->btpo_flags;
|
|
|
|
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
|
|
|
|
ropaque->btpo_prev = origpagenumber;
|
|
|
|
ropaque->btpo_next = oopaque->btpo_next;
|
2021-02-25 03:41:34 +01:00
|
|
|
ropaque->btpo_level = oopaque->btpo_level;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add new high key to rightpage where necessary.
|
|
|
|
*
|
|
|
|
* If the page we're splitting is not the rightmost page at its level in
|
|
|
|
* the tree, then the first entry on the page is the high key from
|
|
|
|
* origpage.
|
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
afterrightoff = P_HIKEY;
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isrightmost)
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
IndexTuple righthighkey;
|
|
|
|
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
itemid = PageGetItemId(origpage, P_HIKEY);
|
|
|
|
itemsz = ItemIdGetLength(itemid);
|
2020-04-14 01:39:55 +02:00
|
|
|
righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
|
|
|
|
Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
|
|
|
|
Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
|
|
|
|
IndexRelationGetNumberOfKeyAttributes(rel));
|
|
|
|
if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
false, false) == InvalidOffsetNumber)
|
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
2020-04-14 01:39:55 +02:00
|
|
|
elog(ERROR, "failed to add high key to the right sibling"
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterrightoff = OffsetNumberNext(afterrightoff);
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
}
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
/*
|
|
|
|
* Internal page splits truncate first data item on right page -- it
|
|
|
|
* becomes "minus infinity" item for the page. Set this up here.
|
|
|
|
*/
|
|
|
|
minusinfoff = InvalidOffsetNumber;
|
|
|
|
if (!isleaf)
|
|
|
|
minusinfoff = afterrightoff;
|
|
|
|
|
Don't leave behind junk nbtree pages during split.
Commit 8fa30f906be reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR. At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section. This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions. The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog(). There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.
Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes. This happened at a point
that broke the rule around zeroing the right page in _bt_split(). If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents. This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.
To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired. A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.
Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
2019-05-13 19:27:59 +02:00
|
|
|
/*
|
|
|
|
* Now transfer all the data items (non-pivot tuples in isleaf case, or
|
|
|
|
* additional pivot tuples in !isleaf case) to the appropriate page.
|
2007-04-11 22:47:38 +02:00
|
|
|
*
|
|
|
|
* Note: we *must* insert at least the right page's items in item-number
|
|
|
|
* order, for the benefit of _bt_restore_page().
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
IndexTuple dataitem;
|
|
|
|
|
1997-06-10 09:28:50 +02:00
|
|
|
itemid = PageGetItemId(origpage, i);
|
|
|
|
itemsz = ItemIdGetLength(itemid);
|
2020-04-14 01:39:55 +02:00
|
|
|
dataitem = (IndexTuple) PageGetItem(origpage, itemid);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* replace original item with nposting due to posting split? */
|
|
|
|
if (i == origpagepostingoff)
|
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(BTreeTupleIsPosting(dataitem));
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
|
2020-04-14 01:39:55 +02:00
|
|
|
dataitem = nposting;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* does new item belong before this one? */
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
else if (i == newitemoff)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
if (newitemonleft)
|
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(newitemoff <= firstrightoff);
|
|
|
|
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
|
|
|
|
false))
|
2010-08-29 21:33:14 +02:00
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
|
|
|
elog(ERROR, "failed to add new item to the left sibling"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterleftoff = OffsetNumberNext(afterleftoff);
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(newitemoff >= firstrightoff);
|
|
|
|
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
|
|
|
|
afterrightoff == minusinfoff))
|
2010-08-29 21:33:14 +02:00
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
|
|
|
elog(ERROR, "failed to add new item to the right sibling"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterrightoff = OffsetNumberNext(afterrightoff);
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1997-06-06 05:11:46 +02:00
|
|
|
/* decide which page to put it on */
|
2020-04-14 01:39:55 +02:00
|
|
|
if (i < firstrightoff)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
|
2010-08-29 21:33:14 +02:00
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
|
|
|
elog(ERROR, "failed to add old item to the left sibling"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterleftoff = OffsetNumberNext(afterleftoff);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
|
|
|
|
afterrightoff == minusinfoff))
|
2010-08-29 21:33:14 +02:00
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
|
|
|
elog(ERROR, "failed to add old item to the right sibling"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterrightoff = OffsetNumberNext(afterrightoff);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
/* Handle case where newitem goes at the end of rightpage */
|
2000-07-21 08:42:39 +02:00
|
|
|
if (i <= newitemoff)
|
|
|
|
{
|
2007-02-06 15:55:11 +01:00
|
|
|
/*
|
|
|
|
* Can't have newitemonleft here; that would imply we were told to put
|
|
|
|
* *everything* on the left page, which cannot fit (if it could, we'd
|
|
|
|
* not be splitting the page).
|
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(!newitemonleft && newitemoff == maxoff + 1);
|
|
|
|
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
|
|
|
|
afterrightoff == minusinfoff))
|
2010-08-29 21:33:14 +02:00
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
|
|
|
elog(ERROR, "failed to add new item to the right sibling"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
origpagenumber, RelationGetRelationName(rel));
|
|
|
|
}
|
2020-04-14 01:39:55 +02:00
|
|
|
afterrightoff = OffsetNumberNext(afterrightoff);
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
/*
|
2020-08-09 21:01:15 +02:00
|
|
|
* We have to grab the original right sibling (if any) and update its prev
|
|
|
|
* link. We are guaranteed that this is deadlock-free, since we couple
|
|
|
|
* the locks in the standard order: left to right.
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isrightmost)
|
2000-10-04 02:04:43 +02:00
|
|
|
{
|
2023-06-10 23:08:25 +02:00
|
|
|
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
spage = BufferGetPage(sbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
sopaque = BTPageGetOpaque(spage);
|
2010-08-29 21:33:14 +02:00
|
|
|
if (sopaque->btpo_prev != origpagenumber)
|
|
|
|
{
|
|
|
|
memset(rightpage, 0, BufferGetPageSize(rbuf));
|
2019-08-01 11:05:08 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
|
|
|
errmsg_internal("right sibling's left-link doesn't match: "
|
|
|
|
"block %u links to %u instead of expected %u in index \"%s\"",
|
|
|
|
oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
|
|
|
|
RelationGetRelationName(rel))));
|
2010-08-29 21:33:14 +02:00
|
|
|
}
|
2006-10-04 02:30:14 +02:00
|
|
|
|
2006-05-08 02:00:17 +02:00
|
|
|
/*
|
|
|
|
* Check to see if we can set the SPLIT_END flag in the right-hand
|
|
|
|
* split page; this can save some I/O for vacuum since it need not
|
|
|
|
* proceed to the right sibling. We can set the flag if the right
|
|
|
|
* sibling has a different cycleid: that means it could not be part of
|
|
|
|
* a group of pages that were all split off from the same ancestor
|
|
|
|
* page. If you're confused, imagine that page A splits to A B and
|
|
|
|
* then again, yielding A C B, while vacuum is in progress. Tuples
|
|
|
|
* originally in A could now be in either B or C, hence vacuum must
|
|
|
|
* examine both pages. But if D, our right sibling, has a different
|
|
|
|
* cycleid then it could not contain any tuples that were in A when
|
|
|
|
* the vacuum started.
|
|
|
|
*/
|
|
|
|
if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
|
|
|
|
ropaque->btpo_flags |= BTP_SPLIT_END;
|
2000-10-04 02:04:43 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Right sibling is locked, new siblings are prepared, but original page
|
2007-04-11 22:47:38 +02:00
|
|
|
* is not updated yet.
|
2000-10-04 02:04:43 +02:00
|
|
|
*
|
2006-04-01 01:32:07 +02:00
|
|
|
* NO EREPORT(ERROR) till right sibling is updated. We can get away with
|
|
|
|
* not starting the critical section till here because we haven't been
|
2010-08-29 21:33:14 +02:00
|
|
|
* scribbling on the original page yet; see comments above.
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
2001-01-12 22:54:01 +01:00
|
|
|
START_CRIT_SECTION();
|
2002-08-06 04:36:35 +02:00
|
|
|
|
2007-02-08 06:05:53 +01:00
|
|
|
/*
|
|
|
|
* By here, the original data page has been split into two new halves, and
|
|
|
|
* these are correct. The algorithm requires that the left page never
|
|
|
|
* move during a split, so we copy the new left page back on top of the
|
2020-04-14 23:38:28 +02:00
|
|
|
* original. We need to do this before writing the WAL record, so that
|
|
|
|
* XLogInsert can WAL log an image of the page if necessary.
|
2007-02-08 06:05:53 +01:00
|
|
|
*/
|
|
|
|
PageRestoreTempPage(leftpage, origpage);
|
2010-08-29 21:33:14 +02:00
|
|
|
/* leftpage, lopaque must not be used below here */
|
2007-02-08 06:05:53 +01:00
|
|
|
|
2007-04-11 22:47:38 +02:00
|
|
|
MarkBufferDirty(buf);
|
|
|
|
MarkBufferDirty(rbuf);
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isrightmost)
|
2007-04-11 22:47:38 +02:00
|
|
|
{
|
2010-08-29 21:33:14 +02:00
|
|
|
sopaque->btpo_prev = rightpagenumber;
|
2007-04-11 22:47:38 +02:00
|
|
|
MarkBufferDirty(sbuf);
|
|
|
|
}
|
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/*
|
|
|
|
* Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
|
2020-04-14 04:26:41 +02:00
|
|
|
* a split
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
*/
|
|
|
|
if (!isleaf)
|
|
|
|
{
|
2016-04-20 15:31:19 +02:00
|
|
|
Page cpage = BufferGetPage(cbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
BTPageOpaque cpageop = BTPageGetOpaque(cpage);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
|
|
|
|
MarkBufferDirty(cbuf);
|
|
|
|
}
|
|
|
|
|
2002-08-06 04:36:35 +02:00
|
|
|
/* XLOG stuff */
|
2010-12-13 18:34:26 +01:00
|
|
|
if (RelationNeedsWAL(rel))
|
2000-10-04 02:04:43 +02:00
|
|
|
{
|
2000-12-28 14:00:29 +01:00
|
|
|
xl_btree_split xlrec;
|
2003-02-21 01:06:22 +01:00
|
|
|
uint8 xlinfo;
|
2000-12-28 14:00:29 +01:00
|
|
|
XLogRecPtr recptr;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
2021-02-25 03:41:34 +01:00
|
|
|
xlrec.level = ropaque->btpo_level;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
/* See comments below on newitem, orignewitem, and posting lists */
|
2020-04-14 01:39:55 +02:00
|
|
|
xlrec.firstrightoff = firstrightoff;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
xlrec.newitemoff = newitemoff;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
xlrec.postingoff = 0;
|
2020-04-14 01:39:55 +02:00
|
|
|
if (postingoff != 0 && origpagepostingoff < firstrightoff)
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
xlrec.postingoff = postingoff;
|
2000-12-28 14:00:29 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
|
2007-02-08 06:05:53 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
|
|
|
|
XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
|
2020-04-14 01:39:55 +02:00
|
|
|
/* Log original right sibling, since we've changed its prev-pointer */
|
|
|
|
if (!isrightmost)
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isleaf)
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
|
2007-02-08 06:05:53 +01:00
|
|
|
|
2007-04-11 22:47:38 +02:00
|
|
|
/*
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
* Log the new item, if it was inserted on the left page. (If it was
|
|
|
|
* put on the right page, we don't need to explicitly WAL log it
|
|
|
|
* because it's included with all the other items on the right page.)
|
|
|
|
* Show the new item as belonging to the left page buffer, so that it
|
|
|
|
* is not stored if XLogInsert decides it needs a full-page image of
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
* the left page. We always store newitemoff in the record, though.
|
|
|
|
*
|
|
|
|
* The details are sometimes slightly different for page splits that
|
|
|
|
* coincide with a posting list split. If both the replacement
|
|
|
|
* posting list and newitem go on the right page, then we don't need
|
|
|
|
* to log anything extra, just like the simple !newitemonleft
|
|
|
|
* no-posting-split case (postingoff is set to zero in the WAL record,
|
|
|
|
* so recovery doesn't need to process a posting list split at all).
|
|
|
|
* Otherwise, we set postingoff and log orignewitem instead of
|
|
|
|
* newitem, despite having actually inserted newitem. REDO routine
|
|
|
|
* must reconstruct nposting and newitem using _bt_swap_posting().
|
|
|
|
*
|
|
|
|
* Note: It's possible that our page split point is the point that
|
|
|
|
* makes the posting list lastleft and newitem firstright. This is
|
|
|
|
* the only case where we log orignewitem/newitem despite newitem
|
|
|
|
* going on the right page. If XLogInsert decides that it can omit
|
|
|
|
* orignewitem due to logging a full-page image of the left page,
|
|
|
|
* everything still works out, since recovery only needs to log
|
|
|
|
* orignewitem for items on the left page (just like the regular
|
|
|
|
* newitem-logged case).
|
2007-02-08 06:05:53 +01:00
|
|
|
*/
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
if (newitemonleft && xlrec.postingoff == 0)
|
2020-04-14 01:39:55 +02:00
|
|
|
XLogRegisterBufData(0, (char *) newitem, newitemsz);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
else if (xlrec.postingoff != 0)
|
|
|
|
{
|
2020-04-14 01:39:55 +02:00
|
|
|
Assert(isleaf);
|
|
|
|
Assert(newitemonleft || firstrightoff == newitemoff);
|
|
|
|
Assert(newitemsz == IndexTupleSize(orignewitem));
|
|
|
|
XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
}
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/* Log the left page's new high key */
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isleaf)
|
|
|
|
{
|
|
|
|
/* lefthighkey isn't local copy, get current pointer */
|
|
|
|
itemid = PageGetItemId(origpage, P_HIKEY);
|
|
|
|
lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
|
|
|
|
}
|
|
|
|
XLogRegisterBufData(0, (char *) lefthighkey,
|
|
|
|
MAXALIGN(IndexTupleSize(lefthighkey)));
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
2007-04-11 22:47:38 +02:00
|
|
|
/*
|
|
|
|
* Log the contents of the right page in the format understood by
|
2019-03-04 21:32:40 +01:00
|
|
|
* _bt_restore_page(). The whole right page will be recreated.
|
2007-02-08 06:05:53 +01:00
|
|
|
*
|
2002-08-06 04:36:35 +02:00
|
|
|
* Direct access to page is not good but faster - we should implement
|
2003-02-21 01:06:22 +01:00
|
|
|
* some new func in page API. Note we only store the tuples
|
2007-04-11 22:47:38 +02:00
|
|
|
* themselves, knowing that they were inserted in item-number order
|
2019-05-14 00:53:39 +02:00
|
|
|
* and so the line pointers can be reconstructed. See comments for
|
2006-04-13 05:53:05 +02:00
|
|
|
* _bt_restore_page().
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBufData(1,
|
|
|
|
(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
|
|
|
|
((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
|
2007-02-08 06:05:53 +01:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2007-02-08 14:52:55 +01:00
|
|
|
PageSetLSN(origpage, recptr);
|
2000-10-04 02:04:43 +02:00
|
|
|
PageSetLSN(rightpage, recptr);
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isrightmost)
|
2000-10-04 02:04:43 +02:00
|
|
|
PageSetLSN(spage, recptr);
|
2014-04-01 18:19:47 +02:00
|
|
|
if (!isleaf)
|
2016-04-20 15:31:19 +02:00
|
|
|
PageSetLSN(BufferGetPage(cbuf), recptr);
|
2000-10-04 02:04:43 +02:00
|
|
|
}
|
|
|
|
|
2001-01-24 00:29:22 +01:00
|
|
|
END_CRIT_SECTION();
|
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
/* release the old right sibling */
|
2020-04-14 01:39:55 +02:00
|
|
|
if (!isrightmost)
|
2006-04-01 01:32:07 +02:00
|
|
|
_bt_relbuf(rel, sbuf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/* release the child */
|
|
|
|
if (!isleaf)
|
|
|
|
_bt_relbuf(rel, cbuf);
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
/* be tidy */
|
|
|
|
if (isleaf)
|
|
|
|
pfree(lefthighkey);
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/* split's done */
|
|
|
|
return rbuf;
|
|
|
|
}
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
2019-05-03 22:34:45 +02:00
|
|
|
* _bt_insert_parent() -- Insert downlink into parent, completing split.
|
2003-02-21 01:06:22 +01:00
|
|
|
*
|
|
|
|
* On entry, buf and rbuf are the left and right split pages, which we
|
2019-05-03 22:34:45 +02:00
|
|
|
* still hold write locks on. Both locks will be released here. We
|
|
|
|
* release the rbuf lock once we have a write lock on the page that we
|
|
|
|
* intend to insert a downlink to rbuf on (i.e. buf's current parent page).
|
|
|
|
* The lock on buf is released at the same point as the lock on the parent
|
|
|
|
* page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
|
|
|
|
* atomic operation that completes the split by inserting a new downlink.
|
2003-02-21 01:06:22 +01:00
|
|
|
*
|
2018-12-19 01:59:50 +01:00
|
|
|
* stack - stack showing how we got here. Will be NULL when splitting true
|
|
|
|
* root, or during concurrent root split, where we can be inefficient
|
2020-11-17 18:01:14 +01:00
|
|
|
* isroot - we split the true root
|
|
|
|
* isonly - we split a page alone on its level (might have been fast root)
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
static void
|
2003-02-21 01:06:22 +01:00
|
|
|
_bt_insert_parent(Relation rel,
|
2023-04-02 05:12:26 +02:00
|
|
|
Relation heaprel,
|
2003-02-21 01:06:22 +01:00
|
|
|
Buffer buf,
|
|
|
|
Buffer rbuf,
|
|
|
|
BTStack stack,
|
2020-11-17 18:01:14 +01:00
|
|
|
bool isroot,
|
|
|
|
bool isonly)
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
2023-06-10 23:08:25 +02:00
|
|
|
Assert(heaprel != NULL);
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
|
|
|
* Here we have to do something Lehman and Yao don't talk about: deal with
|
|
|
|
* a root split and construction of a new root. If our stack is empty
|
|
|
|
* then we have just split a node on what had been the root level when we
|
|
|
|
* descended the tree. If it was still the root then we perform a
|
|
|
|
* new-root construction. If it *wasn't* the root anymore, search to find
|
|
|
|
* the next higher level that someone constructed meanwhile, and find the
|
|
|
|
* right place to insert as for the normal case.
|
|
|
|
*
|
|
|
|
* If we have to search for the parent level, we do so by re-descending
|
|
|
|
* from the root. This is not super-efficient, but it's rare enough not
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
* to matter.
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
if (isroot)
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
|
|
|
Buffer rootbuf;
|
|
|
|
|
2004-01-07 19:56:30 +01:00
|
|
|
Assert(stack == NULL);
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(isonly);
|
2023-06-10 23:08:25 +02:00
|
|
|
/* create a new root node one level up and update the metapage */
|
|
|
|
rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
|
2003-02-21 01:06:22 +01:00
|
|
|
/* release the split buffers */
|
2006-04-01 01:32:07 +02:00
|
|
|
_bt_relbuf(rel, rootbuf);
|
|
|
|
_bt_relbuf(rel, rbuf);
|
|
|
|
_bt_relbuf(rel, buf);
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
BlockNumber bknum = BufferGetBlockNumber(buf);
|
|
|
|
BlockNumber rbknum = BufferGetBlockNumber(rbuf);
|
2016-04-20 15:31:19 +02:00
|
|
|
Page page = BufferGetPage(buf);
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple new_item;
|
2003-02-21 01:06:22 +01:00
|
|
|
BTStackData fakestack;
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple ritem;
|
2003-02-21 01:06:22 +01:00
|
|
|
Buffer pbuf;
|
|
|
|
|
2004-01-07 19:56:30 +01:00
|
|
|
if (stack == NULL)
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
2020-11-17 18:01:14 +01:00
|
|
|
BTPageOpaque opaque;
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2014-09-11 21:43:56 +02:00
|
|
|
elog(DEBUG2, "concurrent ROOT page split");
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2020-03-11 01:25:47 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We should never reach here when a leaf page split takes place
|
|
|
|
* despite the insert of newitem being able to apply the fastpath
|
|
|
|
* optimization. Make sure of that with an assertion.
|
|
|
|
*
|
|
|
|
* This is more of a performance issue than a correctness issue.
|
|
|
|
* The fastpath won't have a descent stack. Using a phony stack
|
|
|
|
* here works, but never rely on that. The fastpath should be
|
2020-03-18 22:42:49 +01:00
|
|
|
* rejected within _bt_search_insert() when the rightmost leaf
|
|
|
|
* page will split, since it's faster to go through _bt_search()
|
|
|
|
* and get a stack in the usual way.
|
2020-03-11 01:25:47 +01:00
|
|
|
*/
|
2020-11-17 18:01:14 +01:00
|
|
|
Assert(!(P_ISLEAF(opaque) &&
|
2020-03-11 01:25:47 +01:00
|
|
|
BlockNumberIsValid(RelationGetTargetBlock(rel))));
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/* Find the leftmost page at the next level up */
|
2023-09-08 07:12:12 +02:00
|
|
|
pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
|
2003-02-21 01:06:22 +01:00
|
|
|
/* Set up a phony stack entry pointing there */
|
|
|
|
stack = &fakestack;
|
|
|
|
stack->bts_blkno = BufferGetBlockNumber(pbuf);
|
|
|
|
stack->bts_offset = InvalidOffsetNumber;
|
|
|
|
stack->bts_parent = NULL;
|
|
|
|
_bt_relbuf(rel, pbuf);
|
|
|
|
}
|
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
/* get high key from left, a strict lower bound for new right page */
|
2006-01-26 00:04:21 +01:00
|
|
|
ritem = (IndexTuple) PageGetItem(page,
|
|
|
|
PageGetItemId(page, P_HIKEY));
|
2003-02-21 01:06:22 +01:00
|
|
|
|
|
|
|
/* form an index tuple that points at the new right page */
|
2006-01-26 00:04:21 +01:00
|
|
|
new_item = CopyIndexTuple(ritem);
|
2019-12-17 02:49:45 +01:00
|
|
|
BTreeTupleSetDownLink(new_item, rbknum);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
|
|
|
/*
|
2019-05-03 22:34:45 +02:00
|
|
|
* Re-find and write lock the parent of buf.
|
2003-02-21 01:06:22 +01:00
|
|
|
*
|
2019-05-03 22:34:45 +02:00
|
|
|
* It's possible that the location of buf's downlink has changed since
|
|
|
|
* our initial _bt_search() descent. _bt_getstackbuf() will detect
|
|
|
|
* and recover from this, updating the stack, which ensures that the
|
|
|
|
* new downlink will be inserted at the correct offset. Even buf's
|
|
|
|
* parent may have changed.
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
2023-04-02 05:12:26 +02:00
|
|
|
pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/*
|
2020-03-28 00:44:52 +01:00
|
|
|
* Unlock the right child. The left child will be unlocked in
|
|
|
|
* _bt_insertonpg().
|
|
|
|
*
|
|
|
|
* Unlocking the right child must be delayed until here to ensure that
|
|
|
|
* no concurrent VACUUM operation can become confused. Page deletion
|
|
|
|
* cannot be allowed to fail to re-find a downlink for the rbuf page.
|
|
|
|
* (Actually, this is just a vestige of how things used to work. The
|
|
|
|
* page deletion code is expected to check for the INCOMPLETE_SPLIT
|
|
|
|
* flag on the left child. It won't attempt deletion of the right
|
|
|
|
* child until the split is complete. Despite all this, we opt to
|
|
|
|
* conservatively delay unlocking the right child until here.)
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
*/
|
2006-04-01 01:32:07 +02:00
|
|
|
_bt_relbuf(rel, rbuf);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
|
|
|
if (pbuf == InvalidBuffer)
|
2019-08-01 11:05:08 +02:00
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
|
|
|
errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
|
|
|
|
RelationGetRelationName(rel), bknum, rbknum)));
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2019-08-14 20:32:35 +02:00
|
|
|
/* Recursively insert into the parent */
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
|
2020-03-16 20:00:10 +01:00
|
|
|
new_item, MAXALIGN(IndexTupleSize(new_item)),
|
2020-11-17 18:01:14 +01:00
|
|
|
stack->bts_offset + 1, 0, isonly);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
|
|
|
/* be tidy */
|
|
|
|
pfree(new_item);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/*
|
|
|
|
* _bt_finish_split() -- Finish an incomplete split
|
|
|
|
*
|
|
|
|
* A crash or other failure can leave a split incomplete. The insertion
|
|
|
|
* routines won't allow to insert on a page that is incompletely split.
|
|
|
|
* Before inserting on such a page, call _bt_finish_split().
|
|
|
|
*
|
|
|
|
* On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
|
|
|
|
* and unpinned.
|
2023-06-10 23:08:25 +02:00
|
|
|
*
|
|
|
|
* Caller must provide a valid heaprel, since finishing a page split requires
|
|
|
|
* allocating a new page if and when the parent page splits in turn.
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
*/
|
|
|
|
void
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
{
|
2016-04-20 15:31:19 +02:00
|
|
|
Page lpage = BufferGetPage(lbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
BTPageOpaque lpageop = BTPageGetOpaque(lpage);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
Buffer rbuf;
|
|
|
|
Page rpage;
|
|
|
|
BTPageOpaque rpageop;
|
2020-11-17 18:01:14 +01:00
|
|
|
bool wasroot;
|
|
|
|
bool wasonly;
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
Assert(P_INCOMPLETE_SPLIT(lpageop));
|
2023-06-10 23:08:25 +02:00
|
|
|
Assert(heaprel != NULL);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
/* Lock right sibling, the one missing the downlink */
|
2023-06-10 23:08:25 +02:00
|
|
|
rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
rpage = BufferGetPage(rbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
rpageop = BTPageGetOpaque(rpage);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
/* Could this be a root split? */
|
|
|
|
if (!stack)
|
|
|
|
{
|
|
|
|
Buffer metabuf;
|
|
|
|
Page metapg;
|
|
|
|
BTMetaPageData *metad;
|
|
|
|
|
|
|
|
/* acquire lock on the metapage */
|
2023-06-10 23:08:25 +02:00
|
|
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
metapg = BufferGetPage(metabuf);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
metad = BTPageGetMeta(metapg);
|
|
|
|
|
2020-11-17 18:01:14 +01:00
|
|
|
wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
_bt_relbuf(rel, metabuf);
|
|
|
|
}
|
|
|
|
else
|
2020-11-17 18:01:14 +01:00
|
|
|
wasroot = false;
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
/* Was this the only page on the level before split? */
|
2020-11-17 18:01:14 +01:00
|
|
|
wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
|
|
|
|
elog(DEBUG1, "finishing incomplete split of %u/%u",
|
|
|
|
BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
|
|
|
|
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2019-08-14 20:32:35 +02:00
|
|
|
* _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
|
|
|
|
* tuple whose downlink points to child page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2019-08-14 20:32:35 +02:00
|
|
|
* Caller passes child's block number, which is used to identify
|
|
|
|
* associated pivot tuple in parent page using a linear search that
|
|
|
|
* matches on pivot's downlink/block number. The expected location of
|
|
|
|
* the pivot tuple is taken from the stack one level above the child
|
|
|
|
* page. This is used as a starting point. Insertions into the
|
|
|
|
* parent level could cause the pivot tuple to move right; deletions
|
|
|
|
* could cause it to move left, but not left of the page we previously
|
|
|
|
* found it on.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2019-08-14 20:32:35 +02:00
|
|
|
* Caller can use its stack to relocate the pivot tuple/downlink for
|
|
|
|
* any same-level page to the right of the page found by its initial
|
|
|
|
* descent. This is necessary because of the possibility that caller
|
|
|
|
* moved right to recover from a concurrent page split. It's also
|
|
|
|
* convenient for certain callers to be able to step right when there
|
|
|
|
* wasn't a concurrent page split, while still using their original
|
|
|
|
* stack. For example, the checkingunique _bt_doinsert() case may
|
|
|
|
* have to step right when there are many physical duplicates, and its
|
|
|
|
* scantid forces an insertion to the right of the "first page the
|
2020-05-11 20:01:07 +02:00
|
|
|
* value could be on". (This is also relied on by all of our callers
|
|
|
|
* when dealing with !heapkeyspace indexes.)
|
2003-02-21 01:06:22 +01:00
|
|
|
*
|
2019-08-14 20:32:35 +02:00
|
|
|
* Returns write-locked parent page buffer, or InvalidBuffer if pivot
|
|
|
|
* tuple not found (should not happen). Adjusts bts_blkno &
|
|
|
|
* bts_offset if changed. Page split caller should insert its new
|
|
|
|
* pivot tuple for its new right sibling page on parent page, at the
|
|
|
|
* offset number bts_offset + 1.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
2003-02-23 07:17:13 +01:00
|
|
|
Buffer
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
BlockNumber blkno;
|
2003-02-21 01:06:22 +01:00
|
|
|
OffsetNumber start;
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
blkno = stack->bts_blkno;
|
|
|
|
start = stack->bts_offset;
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
for (;;)
|
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
|
2023-06-10 23:08:25 +02:00
|
|
|
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
page = BufferGetPage(buf);
|
2022-04-01 06:24:50 +02:00
|
|
|
opaque = BTPageGetOpaque(page);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2023-06-10 23:08:25 +02:00
|
|
|
Assert(heaprel != NULL);
|
2019-02-26 02:47:43 +01:00
|
|
|
if (P_INCOMPLETE_SPLIT(opaque))
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
{
|
2023-04-02 05:12:26 +02:00
|
|
|
_bt_finish_split(rel, heaprel, buf, stack->bts_parent);
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
if (!P_IGNORE(opaque))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
OffsetNumber offnum,
|
|
|
|
minoff,
|
|
|
|
maxoff;
|
|
|
|
ItemId itemid;
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple item;
|
2003-02-22 01:45:05 +01:00
|
|
|
|
|
|
|
minoff = P_FIRSTDATAKEY(opaque);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* start = InvalidOffsetNumber means "search the whole page". We
|
|
|
|
* need this test anyway due to possibility that page has a high
|
|
|
|
* key now when it didn't before.
|
|
|
|
*/
|
|
|
|
if (start < minoff)
|
|
|
|
start = minoff;
|
|
|
|
|
2004-08-18 01:15:33 +02:00
|
|
|
/*
|
|
|
|
* Need this check too, to guard against possibility that page
|
|
|
|
* split since we visited it originally.
|
|
|
|
*/
|
|
|
|
if (start > maxoff)
|
|
|
|
start = OffsetNumberNext(maxoff);
|
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/*
|
|
|
|
* These loops will check every item on the page --- but in an
|
|
|
|
* order that's attuned to the probability of where it actually
|
|
|
|
* is. Scan to the right first, then to the left.
|
|
|
|
*/
|
|
|
|
for (offnum = start;
|
|
|
|
offnum <= maxoff;
|
|
|
|
offnum = OffsetNumberNext(offnum))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
itemid = PageGetItemId(page, offnum);
|
2006-01-26 00:04:21 +01:00
|
|
|
item = (IndexTuple) PageGetItem(page, itemid);
|
2018-04-07 22:00:39 +02:00
|
|
|
|
2019-12-17 02:49:45 +01:00
|
|
|
if (BTreeTupleGetDownLink(item) == child)
|
2003-02-22 01:45:05 +01:00
|
|
|
{
|
|
|
|
/* Return accurate pointer to where link is now */
|
|
|
|
stack->bts_blkno = blkno;
|
|
|
|
stack->bts_offset = offnum;
|
|
|
|
return buf;
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
for (offnum = OffsetNumberPrev(start);
|
|
|
|
offnum >= minoff;
|
|
|
|
offnum = OffsetNumberPrev(offnum))
|
2003-02-21 01:06:22 +01:00
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
itemid = PageGetItemId(page, offnum);
|
2006-01-26 00:04:21 +01:00
|
|
|
item = (IndexTuple) PageGetItem(page, itemid);
|
2018-04-07 22:00:39 +02:00
|
|
|
|
2019-12-17 02:49:45 +01:00
|
|
|
if (BTreeTupleGetDownLink(item) == child)
|
2003-02-22 01:45:05 +01:00
|
|
|
{
|
|
|
|
/* Return accurate pointer to where link is now */
|
|
|
|
stack->bts_blkno = blkno;
|
|
|
|
stack->bts_offset = offnum;
|
|
|
|
return buf;
|
|
|
|
}
|
2003-02-21 01:06:22 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* The item we're looking for moved right at least one page.
|
2019-08-24 05:24:49 +02:00
|
|
|
*
|
|
|
|
* Lehman and Yao couple/chain locks when moving right here, which we
|
|
|
|
* can avoid. See nbtree/README.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (P_RIGHTMOST(opaque))
|
2001-01-31 02:08:36 +01:00
|
|
|
{
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2004-04-21 20:24:26 +02:00
|
|
|
return InvalidBuffer;
|
2001-01-31 02:08:36 +01:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = opaque->btpo_next;
|
2003-02-21 01:06:22 +01:00
|
|
|
start = InvalidOffsetNumber;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2023-06-10 23:08:25 +02:00
|
|
|
* _bt_newlevel() -- Create a new level above root page.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
* We've just split the old root page and need to create a new one.
|
|
|
|
* In order to do this, we add a new root page to the file, then lock
|
|
|
|
* the metadata page and update it. This is guaranteed to be deadlock-
|
|
|
|
* free, because all readers release their locks on the metadata page
|
|
|
|
* before trying to lock the root, and all writers lock the root before
|
|
|
|
* trying to lock the metadata page. We have a write lock on the old
|
|
|
|
* root page, so we have not introduced any cycles into the waits-for
|
|
|
|
* graph.
|
|
|
|
*
|
|
|
|
* On entry, lbuf (the old root) and rbuf (its new peer) are write-
|
2001-01-26 02:24:31 +01:00
|
|
|
* locked. On exit, a new root page exists with entries for the
|
|
|
|
* two new children, metapage is updated and unlocked/unpinned.
|
|
|
|
* The new root buffer is returned to caller which has to unlock/unpin
|
|
|
|
* lbuf, rbuf & rootbuf.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2001-01-26 02:24:31 +01:00
|
|
|
static Buffer
|
2023-06-10 23:08:25 +02:00
|
|
|
_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-12-28 14:00:29 +01:00
|
|
|
Buffer rootbuf;
|
|
|
|
Page lpage,
|
|
|
|
rootpage;
|
|
|
|
BlockNumber lbkno,
|
|
|
|
rbkno;
|
|
|
|
BlockNumber rootblknum;
|
|
|
|
BTPageOpaque rootopaque;
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
BTPageOpaque lopaque;
|
2000-12-28 14:00:29 +01:00
|
|
|
ItemId itemid;
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple item;
|
2014-04-04 12:12:38 +02:00
|
|
|
IndexTuple left_item;
|
|
|
|
Size left_item_sz;
|
|
|
|
IndexTuple right_item;
|
|
|
|
Size right_item_sz;
|
2000-12-28 14:00:29 +01:00
|
|
|
Buffer metabuf;
|
|
|
|
Page metapg;
|
|
|
|
BTMetaPageData *metad;
|
2000-10-13 04:03:02 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
lbkno = BufferGetBlockNumber(lbuf);
|
|
|
|
rbkno = BufferGetBlockNumber(rbuf);
|
2016-04-20 15:31:19 +02:00
|
|
|
lpage = BufferGetPage(lbuf);
|
2022-04-01 06:24:50 +02:00
|
|
|
lopaque = BTPageGetOpaque(lpage);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/* get a new root page */
|
2023-06-10 23:08:25 +02:00
|
|
|
rootbuf = _bt_allocbuf(rel, heaprel);
|
2016-04-20 15:31:19 +02:00
|
|
|
rootpage = BufferGetPage(rootbuf);
|
2000-10-04 02:04:43 +02:00
|
|
|
rootblknum = BufferGetBlockNumber(rootbuf);
|
2003-02-22 01:45:05 +01:00
|
|
|
|
|
|
|
/* acquire lock on the metapage */
|
2023-06-10 23:08:25 +02:00
|
|
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
|
2016-04-20 15:31:19 +02:00
|
|
|
metapg = BufferGetPage(metabuf);
|
2000-12-28 14:00:29 +01:00
|
|
|
metad = BTPageGetMeta(metapg);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2014-04-04 12:12:38 +02:00
|
|
|
/*
|
2020-04-14 01:39:55 +02:00
|
|
|
* Create downlink item for left page (old root). The key value used is
|
|
|
|
* "minus infinity", a sentinel value that's reliably less than any real
|
|
|
|
* key value that could appear in the left page.
|
2014-04-04 12:12:38 +02:00
|
|
|
*/
|
|
|
|
left_item_sz = sizeof(IndexTupleData);
|
|
|
|
left_item = (IndexTuple) palloc(left_item_sz);
|
|
|
|
left_item->t_info = left_item_sz;
|
2019-12-17 02:49:45 +01:00
|
|
|
BTreeTupleSetDownLink(left_item, lbkno);
|
2020-04-08 00:56:52 +02:00
|
|
|
BTreeTupleSetNAtts(left_item, 0, false);
|
2014-04-04 12:12:38 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Create downlink item for right page. The key for it is obtained from
|
|
|
|
* the "high key" position in the left page.
|
|
|
|
*/
|
|
|
|
itemid = PageGetItemId(lpage, P_HIKEY);
|
|
|
|
right_item_sz = ItemIdGetLength(itemid);
|
|
|
|
item = (IndexTuple) PageGetItem(lpage, itemid);
|
|
|
|
right_item = CopyIndexTuple(item);
|
2019-12-17 02:49:45 +01:00
|
|
|
BTreeTupleSetDownLink(right_item, rbkno);
|
2014-04-04 12:12:38 +02:00
|
|
|
|
2003-07-21 22:29:40 +02:00
|
|
|
/* NO EREPORT(ERROR) from here till newroot op is logged */
|
2001-01-12 22:54:01 +01:00
|
|
|
START_CRIT_SECTION();
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2018-05-30 18:45:39 +02:00
|
|
|
/* upgrade metapage if needed */
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
if (metad->btm_version < BTREE_NOVAC_VERSION)
|
2018-05-30 18:45:39 +02:00
|
|
|
_bt_upgrademetapage(metapg);
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/* set btree special data */
|
2022-04-01 06:24:50 +02:00
|
|
|
rootopaque = BTPageGetOpaque(rootpage);
|
1996-07-09 08:22:35 +02:00
|
|
|
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
|
2003-02-21 01:06:22 +01:00
|
|
|
rootopaque->btpo_flags = BTP_ROOT;
|
2021-02-25 03:41:34 +01:00
|
|
|
rootopaque->btpo_level =
|
2022-04-01 06:24:50 +02:00
|
|
|
(BTPageGetOpaque(lpage))->btpo_level + 1;
|
2006-05-08 02:00:17 +02:00
|
|
|
rootopaque->btpo_cycleid = 0;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/* update metapage data */
|
|
|
|
metad->btm_root = rootblknum;
|
2021-02-25 03:41:34 +01:00
|
|
|
metad->btm_level = rootopaque->btpo_level;
|
2003-02-21 01:06:22 +01:00
|
|
|
metad->btm_fastroot = rootblknum;
|
2021-02-25 03:41:34 +01:00
|
|
|
metad->btm_fastlevel = rootopaque->btpo_level;
|
1999-03-28 22:32:42 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Insert the left page pointer into the new root page. The root page is
|
|
|
|
* the rightmost page on its level so there is no "high key" in it; the
|
|
|
|
* two items will go into positions P_HIKEY and P_FIRSTKEY.
|
2006-04-13 05:53:05 +02:00
|
|
|
*
|
|
|
|
* Note: we *must* insert the two items in item-number order, for the
|
|
|
|
* benefit of _bt_restore_page().
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple
without non-key attributes where that's expected. Also, remove the
assertion within _bt_isequal(), restoring the v10 function signature. A
similar check will be performed for the page highkey within
_bt_moveright() in most cases. Also avoid dropping all objects within
regression tests, to increase pg_dump test coverage for INCLUDE indexes.
Rather than using infrastructure that's generally intended to be used
with reference counted heap tuple descriptors during truncation, use the
same function that was introduced to store flat TupleDescs in shared
memory (we use a temp palloc'd buffer). This isn't strictly necessary,
but seems more future-proof than the old approach. It also lets us
avoid including rel.h within indextuple.c, which was arguably a
modularity violation. Also, we now call index_deform_tuple() with the
truncated TupleDesc, not the source TupleDesc, since that's more robust,
and saves a few cycles.
In passing, fix a memory leak by pfree'ing truncated pivot tuple memory
during CREATE INDEX. Also pfree during a page split, just to be
consistent.
Refactor _bt_check_natts() to be more readable.
Author: Peter Geoghegan with some editorization by me
Reviewed by: Alexander Korotkov, Teodor Sigaev
Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
2018-04-19 07:45:58 +02:00
|
|
|
Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
|
2014-04-04 12:12:38 +02:00
|
|
|
if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
|
2007-09-20 19:56:33 +02:00
|
|
|
false, false) == InvalidOffsetNumber)
|
2007-12-31 05:52:05 +01:00
|
|
|
elog(PANIC, "failed to add leftkey to new root page"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
|
|
|
* insert the right page pointer into the new root page.
|
|
|
|
*/
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
|
|
|
|
Assert(BTreeTupleGetNAtts(right_item, rel) <=
|
Adjust INCLUDE index truncation comments and code.
Add several assertions that ensure that we're dealing with a pivot tuple
without non-key attributes where that's expected. Also, remove the
assertion within _bt_isequal(), restoring the v10 function signature. A
similar check will be performed for the page highkey within
_bt_moveright() in most cases. Also avoid dropping all objects within
regression tests, to increase pg_dump test coverage for INCLUDE indexes.
Rather than using infrastructure that's generally intended to be used
with reference counted heap tuple descriptors during truncation, use the
same function that was introduced to store flat TupleDescs in shared
memory (we use a temp palloc'd buffer). This isn't strictly necessary,
but seems more future-proof than the old approach. It also lets us
avoid including rel.h within indextuple.c, which was arguably a
modularity violation. Also, we now call index_deform_tuple() with the
truncated TupleDesc, not the source TupleDesc, since that's more robust,
and saves a few cycles.
In passing, fix a memory leak by pfree'ing truncated pivot tuple memory
during CREATE INDEX. Also pfree during a page split, just to be
consistent.
Refactor _bt_check_natts() to be more readable.
Author: Peter Geoghegan with some editorization by me
Reviewed by: Alexander Korotkov, Teodor Sigaev
Discussion: https://www.postgresql.org/message-id/CAH2-Wz%3DkCWuXeMrBCopC-tFs3FbiVxQNjjgNKdG2sHxZ5k2y3w%40mail.gmail.com
2018-04-19 07:45:58 +02:00
|
|
|
IndexRelationGetNumberOfKeyAttributes(rel));
|
2014-04-04 12:12:38 +02:00
|
|
|
if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
|
2007-09-20 19:56:33 +02:00
|
|
|
false, false) == InvalidOffsetNumber)
|
2007-12-31 05:52:05 +01:00
|
|
|
elog(PANIC, "failed to add rightkey to new root page"
|
|
|
|
" while splitting block %u of index \"%s\"",
|
|
|
|
BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page,
and inserting the downlink for the new right page to the parent. Previously,
we handled the case that you crash in between those steps with a cleanup
routine after the WAL recovery had finished, which finished the incomplete
split. However, that doesn't help if the page split is interrupted but the
database doesn't crash, so that you don't perform WAL recovery. That could
happen for example if you run out of disk space.
Remove the end-of-recovery cleanup step. Instead, when a page is split, the
left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink
is inserted to the parent, the flag is cleared again. If an insertion sees
a page with the flag set, it knows that the split was interrupted for some
reason, and inserts the missing downlink before proceeding.
I used the same approach to fix GIN and GiST split algorithms earlier. This
was the last WAL cleanup routine, so we could get rid of that whole
machinery now, but I'll leave that for a separate patch.
Reviewed by Peter Geoghegan.
2014-03-18 19:12:58 +01:00
|
|
|
/* Clear the incomplete-split flag in the left child */
|
|
|
|
Assert(P_INCOMPLETE_SPLIT(lopaque));
|
|
|
|
lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
|
|
|
|
MarkBufferDirty(lbuf);
|
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
MarkBufferDirty(rootbuf);
|
|
|
|
MarkBufferDirty(metabuf);
|
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
/* XLOG stuff */
|
2010-12-13 18:34:26 +01:00
|
|
|
if (RelationNeedsWAL(rel))
|
2000-10-04 02:04:43 +02:00
|
|
|
{
|
2000-10-13 04:03:02 +02:00
|
|
|
xl_btree_newroot xlrec;
|
2000-10-21 17:43:36 +02:00
|
|
|
XLogRecPtr recptr;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
xl_btree_metadata md;
|
2000-10-13 04:03:02 +02:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
xlrec.rootblk = rootblknum;
|
2000-12-28 14:00:29 +01:00
|
|
|
xlrec.level = metad->btm_level;
|
2003-02-21 01:06:22 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
|
|
|
|
|
|
|
|
XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
|
|
|
|
XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
|
2017-11-03 21:31:32 +01:00
|
|
|
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
Make heap TID a tiebreaker nbtree index column.
Make nbtree treat all index tuples as having a heap TID attribute.
Index searches can distinguish duplicates by heap TID, since heap TID is
always guaranteed to be unique. This general approach has numerous
benefits for performance, and is prerequisite to teaching VACUUM to
perform "retail index tuple deletion".
Naively adding a new attribute to every pivot tuple has unacceptable
overhead (it bloats internal pages), so suffix truncation of pivot
tuples is added. This will usually truncate away the "extra" heap TID
attribute from pivot tuples during a leaf page split, and may also
truncate away additional user attributes. This can increase fan-out,
especially in a multi-column index. Truncation can only occur at the
attribute granularity, which isn't particularly effective, but works
well enough for now. A future patch may add support for truncating
"within" text attributes by generating truncated key values using new
opclass infrastructure.
Only new indexes (BTREE_VERSION 4 indexes) will have insertions that
treat heap TID as a tiebreaker attribute, or will have pivot tuples
undergo suffix truncation during a leaf page split (on-disk
compatibility with versions 2 and 3 is preserved). Upgrades to version
4 cannot be performed on-the-fly, unlike upgrades from version 2 to
version 3. contrib/amcheck continues to work with version 2 and 3
indexes, while also enforcing stricter invariants when verifying version
4 indexes. These stricter invariants are the same invariants described
by "3.1.12 Sequencing" from the Lehman and Yao paper.
A later patch will enhance the logic used by nbtree to pick a split
point. This patch is likely to negatively impact performance without
smarter choices around the precise point to split leaf pages at. Making
these two mostly-distinct sets of enhancements into distinct commits
seems like it might clarify their design, even though neither commit is
particularly useful on its own.
The maximum allowed size of new tuples is reduced by an amount equal to
the space required to store an extra MAXALIGN()'d TID in a new high key
during leaf page splits. The user-facing definition of the "1/3 of a
page" restriction is already imprecise, and so does not need to be
revised. However, there should be a compatibility note in the v12
release notes.
Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas, Alexander Korotkov
Discussion: https://postgr.es/m/CAH2-WzkVb0Kom=R+88fDFb=JSxZMFvbHVC6Mn9LJ2n=X=kS-Uw@mail.gmail.com
2019-03-20 18:04:01 +01:00
|
|
|
Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
|
|
|
|
md.version = metad->btm_version;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
md.root = rootblknum;
|
|
|
|
md.level = metad->btm_level;
|
|
|
|
md.fastroot = rootblknum;
|
|
|
|
md.fastlevel = metad->btm_level;
|
2021-02-25 03:41:34 +01:00
|
|
|
md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
md.allequalimage = metad->btm_allequalimage;
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
|
|
|
|
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
|
2000-10-04 02:04:43 +02:00
|
|
|
|
|
|
|
/*
|
2002-08-06 04:36:35 +02:00
|
|
|
* Direct access to page is not good but faster - we should implement
|
2000-10-04 02:04:43 +02:00
|
|
|
* some new func in page API.
|
|
|
|
*/
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now dissects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-dissected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compensates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRegisterBufData(0,
|
|
|
|
(char *) rootpage + ((PageHeader) rootpage)->pd_upper,
|
|
|
|
((PageHeader) rootpage)->pd_special -
|
|
|
|
((PageHeader) rootpage)->pd_upper);
|
|
|
|
|
|
|
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
|
2000-10-13 04:03:02 +02:00
|
|
|
|
2014-04-22 21:40:44 +02:00
|
|
|
PageSetLSN(lpage, recptr);
|
2000-10-04 02:04:43 +02:00
|
|
|
PageSetLSN(rootpage, recptr);
|
2000-10-13 04:03:02 +02:00
|
|
|
PageSetLSN(metapg, recptr);
|
2000-10-04 02:04:43 +02:00
|
|
|
}
|
2002-08-06 04:36:35 +02:00
|
|
|
|
2001-01-12 22:54:01 +01:00
|
|
|
END_CRIT_SECTION();
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2006-04-01 01:32:07 +02:00
|
|
|
/* done with metapage */
|
|
|
|
_bt_relbuf(rel, metabuf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2014-04-04 12:12:38 +02:00
|
|
|
pfree(left_item);
|
|
|
|
pfree(right_item);
|
|
|
|
|
2006-01-11 09:43:13 +01:00
|
|
|
return rootbuf;
|
2001-01-26 02:24:31 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
2020-03-19 02:17:37 +01:00
|
|
|
* _bt_pgaddtup() -- add a data item to a particular page during split.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2020-03-19 02:17:37 +01:00
|
|
|
* The difference between this routine and a bare PageAddItem call is
|
2020-04-14 01:39:55 +02:00
|
|
|
* that this code can deal with the first data item on an internal btree
|
|
|
|
* page in passing. This data item (which is called "firstright" within
|
|
|
|
* _bt_split()) has a key that must be treated as minus infinity after
|
|
|
|
* the split. Therefore, we truncate away all attributes when caller
|
|
|
|
* specifies it's the first data item on page (downlink is not changed,
|
|
|
|
* though). This extra step is only needed for the right page of an
|
|
|
|
* internal page split. There is no need to do this for the first data
|
|
|
|
* item on the existing/left page, since that will already have been
|
|
|
|
* truncated during an earlier page split.
|
2020-03-19 02:17:37 +01:00
|
|
|
*
|
2020-04-14 01:39:55 +02:00
|
|
|
* See _bt_split() for a high level explanation of why we truncate here.
|
|
|
|
* Note that this routine has nothing to do with suffix truncation,
|
|
|
|
* despite using some of the same infrastructure.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2020-04-14 01:39:55 +02:00
|
|
|
static inline bool
|
2010-08-29 21:33:14 +02:00
|
|
|
_bt_pgaddtup(Page page,
|
1996-07-09 08:22:35 +02:00
|
|
|
Size itemsize,
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTuple itup,
|
2020-04-14 01:39:55 +02:00
|
|
|
OffsetNumber itup_off,
|
|
|
|
bool newfirstdataitem)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2006-01-26 00:04:21 +01:00
|
|
|
IndexTupleData trunctuple;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
if (newfirstdataitem)
|
2000-02-18 07:32:39 +01:00
|
|
|
{
|
2006-01-26 00:04:21 +01:00
|
|
|
trunctuple = *itup;
|
|
|
|
trunctuple.t_info = sizeof(IndexTupleData);
|
2020-04-08 00:56:52 +02:00
|
|
|
BTreeTupleSetNAtts(&trunctuple, 0, false);
|
2006-01-26 00:04:21 +01:00
|
|
|
itup = &trunctuple;
|
|
|
|
itemsize = sizeof(IndexTupleData);
|
2000-02-18 07:32:39 +01:00
|
|
|
}
|
|
|
|
|
2020-04-14 01:39:55 +02:00
|
|
|
if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
|
|
|
|
false) == InvalidOffsetNumber))
|
2010-08-29 21:33:14 +02:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-03-24 09:48:16 +01:00
|
|
|
|
2006-07-25 21:13:00 +02:00
|
|
|
/*
|
2021-01-13 18:21:32 +01:00
|
|
|
* _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
|
2020-11-17 18:45:56 +01:00
|
|
|
*
|
2021-01-13 18:21:32 +01:00
|
|
|
* There are three operations performed here: simple index deletion, bottom-up
|
|
|
|
* index deletion, and deduplication. If all three operations fail to free
|
|
|
|
* enough space for the incoming item then caller will go on to split the
|
|
|
|
* page. We always consider simple deletion first. If that doesn't work out
|
|
|
|
* we consider alternatives. Callers that only want us to consider simple
|
|
|
|
* deletion (without any fallback) ask for that using the 'simpleonly'
|
|
|
|
* argument.
|
2020-11-17 18:45:56 +01:00
|
|
|
*
|
2021-01-13 18:21:32 +01:00
|
|
|
* We usually pick only one alternative "complex" operation when simple
|
|
|
|
* deletion alone won't prevent a page split. The 'checkingunique',
|
|
|
|
* 'uniquedup', and 'indexUnchanged' arguments are used for that.
|
2006-07-25 21:13:00 +02:00
|
|
|
*
|
2020-11-17 18:45:56 +01:00
|
|
|
* Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
|
|
|
|
* level flag was found set. The flag was useful back when there wasn't
|
|
|
|
* necessarily one single page for a duplicate tuple to go on (before heap TID
|
|
|
|
* became a part of the key space in version 4 indexes). But we don't
|
|
|
|
* actually look at the flag anymore (it's not a gating condition for our
|
|
|
|
* caller). That would cause us to miss tuples that are safe to delete,
|
|
|
|
* without getting any benefit in return. We know that the alternative is to
|
|
|
|
* split the page; scanning the line pointer array in passing won't have
|
|
|
|
* noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
|
|
|
|
* all this because !heapkeyspace indexes must still do a "getting tired"
|
|
|
|
* linear search, and so are likely to get some benefit from using it as a
|
|
|
|
* gating condition.)
|
2006-07-25 21:13:00 +02:00
|
|
|
*/
|
|
|
|
static void
|
2020-11-17 18:45:56 +01:00
|
|
|
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
|
|
|
|
BTInsertState insertstate,
|
2021-01-13 18:21:32 +01:00
|
|
|
bool simpleonly, bool checkingunique,
|
|
|
|
bool uniquedup, bool indexUnchanged)
|
2006-07-25 21:13:00 +02:00
|
|
|
{
|
Add deduplication to nbtree.
Deduplication reduces the storage overhead of duplicates in indexes that
use the standard nbtree index access method. The deduplication process
is applied lazily, after the point where opportunistic deletion of
LP_DEAD-marked index tuples occurs. Deduplication is only applied at
the point where a leaf page split would otherwise be required. New
posting list tuples are formed by merging together existing duplicate
tuples. The physical representation of the items on an nbtree leaf page
is made more space efficient by deduplication, but the logical contents
of the page are not changed. Even unique indexes make use of
deduplication as a way of controlling bloat from duplicates whose TIDs
point to different versions of the same logical table row.
The lazy approach taken by nbtree has significant advantages over a GIN
style eager approach. Most individual inserts of index tuples have
exactly the same overhead as before. The extra overhead of
deduplication is amortized across insertions, just like the overhead of
page splits. The key space of indexes works in the same way as it has
since commit dd299df8 (the commit that made heap TID a tiebreaker
column).
Testing has shown that nbtree deduplication can generally make indexes
with about 10 or 15 tuples for each distinct key value about 2.5X - 4X
smaller, even with single column integer indexes (e.g., an index on a
referencing column that accompanies a foreign key). The final size of
single column nbtree indexes comes close to the final size of a similar
contrib/btree_gin index, at least in cases where GIN's posting list
compression isn't very effective. This can significantly improve
transaction throughput, and significantly reduce the cost of vacuuming
indexes.
A new index storage parameter (deduplicate_items) controls the use of
deduplication. The default setting is 'on', so all new B-Tree indexes
automatically use deduplication where possible. This decision will be
reviewed at the end of the Postgres 13 beta period.
There is a regression of approximately 2% of transaction throughput with
synthetic workloads that consist of append-only inserts into a table
with several non-unique indexes, where all indexes have few or no
repeated values. The underlying issue is that cycles are wasted on
unsuccessful attempts at deduplicating items in non-unique indexes.
There doesn't seem to be a way around it short of disabling
deduplication entirely. Note that deduplication of items in unique
indexes is fairly well targeted in general, which avoids the problem
there (we can use a special heuristic to trigger deduplication passes in
unique indexes, since we're specifically targeting "version bloat").
Bump XLOG_PAGE_MAGIC because xl_btree_vacuum changed.
No bump in BTREE_VERSION, since the representation of posting list
tuples works in a way that's backwards compatible with version 4 indexes
(i.e. indexes built on PostgreSQL 12). However, users must still
REINDEX a pg_upgrade'd index to use deduplication, regardless of the
Postgres version they've upgraded from. This is the only way to set the
new nbtree metapage flag indicating that deduplication is generally
safe.
Author: Anastasia Lubennikova, Peter Geoghegan
Reviewed-By: Peter Geoghegan, Heikki Linnakangas
Discussion:
https://postgr.es/m/55E4051B.7020209@postgrespro.ru
https://postgr.es/m/4ab6e2db-bcee-f4cf-0916-3a06e6ccbb55@postgrespro.ru
2020-02-26 22:05:30 +01:00
|
|
|
OffsetNumber deletable[MaxIndexTuplesPerPage];
|
2006-07-25 21:13:00 +02:00
|
|
|
int ndeletable = 0;
|
|
|
|
OffsetNumber offnum,
|
2021-01-13 18:21:32 +01:00
|
|
|
minoff,
|
2006-07-25 21:13:00 +02:00
|
|
|
maxoff;
|
2020-11-17 18:45:56 +01:00
|
|
|
Buffer buffer = insertstate->buf;
|
|
|
|
BTScanInsert itup_key = insertstate->itup_key;
|
2016-04-20 15:31:19 +02:00
|
|
|
Page page = BufferGetPage(buffer);
|
2022-04-01 06:24:50 +02:00
|
|
|
BTPageOpaque opaque = BTPageGetOpaque(page);
|
2006-07-25 21:13:00 +02:00
|
|
|
|
2019-03-20 17:30:57 +01:00
|
|
|
Assert(P_ISLEAF(opaque));
|
2021-01-13 18:21:32 +01:00
|
|
|
Assert(simpleonly || itup_key->heapkeyspace);
|
|
|
|
Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
|
2019-03-20 17:30:57 +01:00
|
|
|
|
2006-07-25 21:13:00 +02:00
|
|
|
/*
|
2007-04-11 22:47:38 +02:00
|
|
|
* Scan over all items to see which ones need to be deleted according to
|
2021-01-13 18:21:32 +01:00
|
|
|
* LP_DEAD flags. We'll usually manage to delete a few extra items that
|
|
|
|
* are not marked LP_DEAD in passing. Often the extra items that actually
|
|
|
|
* end up getting deleted are items that would have had their LP_DEAD bit
|
|
|
|
* set before long anyway (if we opted not to include them as extras).
|
2006-07-25 21:13:00 +02:00
|
|
|
*/
|
2021-01-13 18:21:32 +01:00
|
|
|
minoff = P_FIRSTDATAKEY(opaque);
|
2006-07-25 21:13:00 +02:00
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
2021-01-13 18:21:32 +01:00
|
|
|
for (offnum = minoff;
|
2006-07-25 21:13:00 +02:00
|
|
|
offnum <= maxoff;
|
|
|
|
offnum = OffsetNumberNext(offnum))
|
|
|
|
{
|
|
|
|
ItemId itemId = PageGetItemId(page, offnum);
|
|
|
|
|
2007-09-13 00:10:26 +02:00
|
|
|
if (ItemIdIsDead(itemId))
|
2006-07-25 21:13:00 +02:00
|
|
|
deletable[ndeletable++] = offnum;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ndeletable > 0)
|
2020-11-17 18:45:56 +01:00
|
|
|
{
|
2021-01-13 18:21:32 +01:00
|
|
|
_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
|
|
|
|
insertstate->itup, minoff, maxoff);
|
2020-11-17 18:45:56 +01:00
|
|
|
insertstate->bounds_valid = false;
|
|
|
|
|
|
|
|
/* Return when a page split has already been avoided */
|
|
|
|
if (PageGetFreeSpace(page) >= insertstate->itemsz)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Might as well assume duplicates (if checkingunique) */
|
|
|
|
uniquedup = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-01-13 18:21:32 +01:00
|
|
|
* We're done with simple deletion. Return early with callers that only
|
|
|
|
* call here so that simple deletion can be considered. This includes
|
|
|
|
* callers that explicitly ask for this and checkingunique callers that
|
|
|
|
* probably don't have any version churn duplicates on the page.
|
2020-11-17 18:45:56 +01:00
|
|
|
*
|
|
|
|
* Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
|
|
|
|
* return at this point (or when we go on the try either or both of our
|
|
|
|
* other strategies and they also fail). We do not bother expending a
|
|
|
|
* separate write to clear it, however. Caller will definitely clear it
|
2021-01-13 18:21:32 +01:00
|
|
|
* when it goes on to split the page (note also that the deduplication
|
|
|
|
* process will clear the flag in passing, just to keep things tidy).
|
2020-11-17 18:45:56 +01:00
|
|
|
*/
|
2021-01-13 18:21:32 +01:00
|
|
|
if (simpleonly || (checkingunique && !uniquedup))
|
|
|
|
{
|
|
|
|
Assert(!indexUnchanged);
|
2020-11-17 18:45:56 +01:00
|
|
|
return;
|
2021-01-13 18:21:32 +01:00
|
|
|
}
|
2020-11-17 18:45:56 +01:00
|
|
|
|
|
|
|
/* Assume bounds about to be invalidated (this is almost certain now) */
|
|
|
|
insertstate->bounds_valid = false;
|
2006-10-04 02:30:14 +02:00
|
|
|
|
2006-07-25 21:13:00 +02:00
|
|
|
/*
|
2021-01-13 18:21:32 +01:00
|
|
|
* Perform bottom-up index deletion pass when executor hint indicated that
|
|
|
|
* incoming item is logically unchanged, or for a unique index that is
|
|
|
|
* known to have physical duplicates for some other reason. (There is a
|
|
|
|
* large overlap between these two cases for a unique index. It's worth
|
|
|
|
* having both triggering conditions in order to apply the optimization in
|
|
|
|
* the event of successive related INSERT and DELETE statements.)
|
|
|
|
*
|
|
|
|
* We'll go on to do a deduplication pass when a bottom-up pass fails to
|
|
|
|
* delete an acceptable amount of free space (a significant fraction of
|
|
|
|
* the page, or space for the new item, whichever is greater).
|
|
|
|
*
|
|
|
|
* Note: Bottom-up index deletion uses the same equality/equivalence
|
|
|
|
* routines as deduplication internally. However, it does not merge
|
|
|
|
* together index tuples, so the same correctness considerations do not
|
|
|
|
* apply. We deliberately omit an index-is-allequalimage test here.
|
2006-07-25 21:13:00 +02:00
|
|
|
*/
|
2021-01-13 18:21:32 +01:00
|
|
|
if ((indexUnchanged || uniquedup) &&
|
|
|
|
_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Perform deduplication pass (when enabled and index-is-allequalimage) */
|
2020-11-17 18:45:56 +01:00
|
|
|
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
|
2023-04-18 19:33:15 +02:00
|
|
|
_bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
|
|
|
|
(indexUnchanged || uniquedup));
|
2006-07-25 21:13:00 +02:00
|
|
|
}
|
2021-01-13 18:21:32 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_simpledel_pass - Simple index tuple deletion pass.
|
|
|
|
*
|
|
|
|
* We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
|
|
|
|
* of all such tuples are determined by caller (caller passes these to us as
|
|
|
|
* its 'deletable' argument).
|
|
|
|
*
|
|
|
|
* We might also delete extra index tuples that turn out to be safe to delete
|
|
|
|
* in passing (though they must be cheap to check in passing to begin with).
|
|
|
|
* There is no certainty that any extra tuples will be deleted, though. The
|
|
|
|
* high level goal of the approach we take is to get the most out of each call
|
|
|
|
* here (without noticeably increasing the per-call overhead compared to what
|
|
|
|
* we need to do just to be able to delete the page's LP_DEAD-marked index
|
|
|
|
* tuples).
|
|
|
|
*
|
|
|
|
* The number of extra index tuples that turn out to be deletable might
|
|
|
|
* greatly exceed the number of LP_DEAD-marked index tuples due to various
|
|
|
|
* locality related effects. For example, it's possible that the total number
|
|
|
|
* of table blocks (pointed to by all TIDs on the leaf page) is naturally
|
|
|
|
* quite low, in which case we might end up checking if it's possible to
|
|
|
|
* delete _most_ index tuples on the page (without the tableam needing to
|
|
|
|
* access additional table blocks). The tableam will sometimes stumble upon
|
|
|
|
* _many_ extra deletable index tuples in indexes where this pattern is
|
|
|
|
* common.
|
|
|
|
*
|
|
|
|
* See nbtree/README for further details on simple index tuple deletion.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
|
|
|
|
OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
|
|
|
|
OffsetNumber minoff, OffsetNumber maxoff)
|
|
|
|
{
|
|
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
BlockNumber *deadblocks;
|
|
|
|
int ndeadblocks;
|
|
|
|
TM_IndexDeleteOp delstate;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
|
|
|
|
/* Get array of table blocks pointed to by LP_DEAD-set tuples */
|
|
|
|
deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
|
|
|
|
&ndeadblocks);
|
|
|
|
|
|
|
|
/* Initialize tableam state that describes index deletion operation */
|
2021-11-05 03:54:05 +01:00
|
|
|
delstate.irel = rel;
|
|
|
|
delstate.iblknum = BufferGetBlockNumber(buffer);
|
2021-01-13 18:21:32 +01:00
|
|
|
delstate.bottomup = false;
|
|
|
|
delstate.bottomupfreespace = 0;
|
|
|
|
delstate.ndeltids = 0;
|
|
|
|
delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
|
|
|
|
delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
|
|
|
|
|
|
|
|
for (offnum = minoff;
|
|
|
|
offnum <= maxoff;
|
|
|
|
offnum = OffsetNumberNext(offnum))
|
|
|
|
{
|
|
|
|
ItemId itemid = PageGetItemId(page, offnum);
|
|
|
|
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
|
|
|
|
TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
|
|
|
|
TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
|
|
|
|
BlockNumber tidblock;
|
|
|
|
void *match;
|
|
|
|
|
|
|
|
if (!BTreeTupleIsPosting(itup))
|
|
|
|
{
|
|
|
|
tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
|
|
|
|
match = bsearch(&tidblock, deadblocks, ndeadblocks,
|
|
|
|
sizeof(BlockNumber), _bt_blk_cmp);
|
|
|
|
|
|
|
|
if (!match)
|
|
|
|
{
|
|
|
|
Assert(!ItemIdIsDead(itemid));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TID's table block is among those pointed to by the TIDs from
|
|
|
|
* LP_DEAD-bit set tuples on page -- add TID to deltids
|
|
|
|
*/
|
|
|
|
odeltid->tid = itup->t_tid;
|
|
|
|
odeltid->id = delstate.ndeltids;
|
|
|
|
ostatus->idxoffnum = offnum;
|
|
|
|
ostatus->knowndeletable = ItemIdIsDead(itemid);
|
|
|
|
ostatus->promising = false; /* unused */
|
|
|
|
ostatus->freespace = 0; /* unused */
|
|
|
|
|
|
|
|
delstate.ndeltids++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int nitem = BTreeTupleGetNPosting(itup);
|
|
|
|
|
|
|
|
for (int p = 0; p < nitem; p++)
|
|
|
|
{
|
|
|
|
ItemPointer tid = BTreeTupleGetPostingN(itup, p);
|
|
|
|
|
|
|
|
tidblock = ItemPointerGetBlockNumber(tid);
|
|
|
|
match = bsearch(&tidblock, deadblocks, ndeadblocks,
|
|
|
|
sizeof(BlockNumber), _bt_blk_cmp);
|
|
|
|
|
|
|
|
if (!match)
|
|
|
|
{
|
|
|
|
Assert(!ItemIdIsDead(itemid));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TID's table block is among those pointed to by the TIDs
|
|
|
|
* from LP_DEAD-bit set tuples on page -- add TID to deltids
|
|
|
|
*/
|
|
|
|
odeltid->tid = *tid;
|
|
|
|
odeltid->id = delstate.ndeltids;
|
|
|
|
ostatus->idxoffnum = offnum;
|
|
|
|
ostatus->knowndeletable = ItemIdIsDead(itemid);
|
|
|
|
ostatus->promising = false; /* unused */
|
|
|
|
ostatus->freespace = 0; /* unused */
|
|
|
|
|
|
|
|
odeltid++;
|
|
|
|
ostatus++;
|
|
|
|
delstate.ndeltids++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pfree(deadblocks);
|
|
|
|
|
|
|
|
Assert(delstate.ndeltids >= ndeletable);
|
|
|
|
|
|
|
|
/* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
|
|
|
|
_bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
|
|
|
|
|
|
|
|
pfree(delstate.deltids);
|
|
|
|
pfree(delstate.status);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_deadblocks() -- Get LP_DEAD related table blocks.
|
|
|
|
*
|
|
|
|
* Builds sorted and unique-ified array of table block numbers from index
|
|
|
|
* tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
|
|
|
|
* block from incoming newitem just in case it isn't among the LP_DEAD-related
|
|
|
|
* table blocks.
|
|
|
|
*
|
|
|
|
* Always counting the newitem's table block as an LP_DEAD related block makes
|
|
|
|
* sense because the cost is consistently low; it is practically certain that
|
|
|
|
* the table block will not incur a buffer miss in tableam. On the other hand
|
|
|
|
* the benefit is often quite high. There is a decent chance that there will
|
|
|
|
* be some deletable items from this block, since in general most garbage
|
|
|
|
* tuples became garbage in the recent past (in many cases this won't be the
|
|
|
|
* first logical row that core code added to/modified in table block
|
|
|
|
* recently).
|
|
|
|
*
|
|
|
|
* Returns final array, and sets *nblocks to its final size for caller.
|
|
|
|
*/
|
|
|
|
static BlockNumber *
|
|
|
|
_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
|
|
|
|
IndexTuple newitem, int *nblocks)
|
|
|
|
{
|
|
|
|
int spacentids,
|
|
|
|
ntids;
|
|
|
|
BlockNumber *tidblocks;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Accumulate each TID's block in array whose initial size has space for
|
|
|
|
* one table block per LP_DEAD-set tuple (plus space for the newitem table
|
|
|
|
* block). Array will only need to grow when there are LP_DEAD-marked
|
|
|
|
* posting list tuples (which is not that common).
|
|
|
|
*/
|
|
|
|
spacentids = ndeletable + 1;
|
|
|
|
ntids = 0;
|
|
|
|
tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First add the table block for the incoming newitem. This is the one
|
|
|
|
* case where simple deletion can visit a table block that doesn't have
|
|
|
|
* any known deletable items.
|
|
|
|
*/
|
|
|
|
Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
|
|
|
|
tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
|
|
|
|
|
|
|
|
for (int i = 0; i < ndeletable; i++)
|
|
|
|
{
|
|
|
|
ItemId itemid = PageGetItemId(page, deletable[i]);
|
|
|
|
IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
|
|
|
|
|
|
|
|
Assert(ItemIdIsDead(itemid));
|
|
|
|
|
|
|
|
if (!BTreeTupleIsPosting(itup))
|
|
|
|
{
|
|
|
|
if (ntids + 1 > spacentids)
|
|
|
|
{
|
|
|
|
spacentids *= 2;
|
|
|
|
tidblocks = (BlockNumber *)
|
|
|
|
repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
|
|
|
|
}
|
|
|
|
|
|
|
|
tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int nposting = BTreeTupleGetNPosting(itup);
|
|
|
|
|
|
|
|
if (ntids + nposting > spacentids)
|
|
|
|
{
|
|
|
|
spacentids = Max(spacentids * 2, ntids + nposting);
|
|
|
|
tidblocks = (BlockNumber *)
|
|
|
|
repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int j = 0; j < nposting; j++)
|
|
|
|
{
|
|
|
|
ItemPointer tid = BTreeTupleGetPostingN(itup, j);
|
|
|
|
|
|
|
|
tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
|
|
|
|
*nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
|
|
|
|
|
|
|
|
return tidblocks;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
|
|
|
|
*/
|
|
|
|
static inline int
|
|
|
|
_bt_blk_cmp(const void *arg1, const void *arg2)
|
|
|
|
{
|
|
|
|
BlockNumber b1 = *((BlockNumber *) arg1);
|
|
|
|
BlockNumber b2 = *((BlockNumber *) arg2);
|
|
|
|
|
2024-02-16 21:05:36 +01:00
|
|
|
return pg_cmp_u32(b1, b2);
|
2021-01-13 18:21:32 +01:00
|
|
|
}
|