1996-08-27 23:50:29 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
1999-02-14 00:22:53 +01:00
|
|
|
* gist.h
|
2005-05-17 05:34:18 +02:00
|
|
|
* The public API for GiST indexes. This API is exposed to
|
|
|
|
* individuals implementing GiST indexes, so backward-incompatible
|
|
|
|
* changes should be made with care.
|
1997-09-07 07:04:48 +02:00
|
|
|
*
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
2022-01-08 01:04:57 +01:00
|
|
|
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
|
2001-05-30 21:53:40 +02:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/include/access/gist.h
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef GIST_H
|
|
|
|
#define GIST_H
|
|
|
|
|
Implement operator class parameters
PostgreSQL provides set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There opclasses define representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may be
faced some tradeoffs, which require user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to evade changing signature of each opclass support function, we
implement unified way to pass options to opclass support functions. Options
are set to fn_expr as the constant bytea expression. It's possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
#include "access/itup.h"
|
2019-07-24 19:24:07 +02:00
|
|
|
#include "access/transam.h"
|
2005-11-07 18:36:47 +01:00
|
|
|
#include "access/xlog.h"
|
|
|
|
#include "access/xlogdefs.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "storage/block.h"
|
2005-05-17 05:34:18 +02:00
|
|
|
#include "storage/bufpage.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "utils/relcache.h"
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
 * amproc indexes for GiST indexes.
 *
 * Each constant names one GiST support procedure slot.  The values in this
 * group run consecutively from 1 through 9.
 */
#define GIST_CONSISTENT_PROC			1
#define GIST_UNION_PROC					2
#define GIST_COMPRESS_PROC				3
#define GIST_DECOMPRESS_PROC			4
#define GIST_PENALTY_PROC				5
#define GIST_PICKSPLIT_PROC				6
#define GIST_EQUAL_PROC					7
#define GIST_DISTANCE_PROC				8
#define GIST_FETCH_PROC					9
|
Implement operator class parameters
PostgreSQL provides set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There opclasses define representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may be
faced some tradeoffs, which require user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to evade changing signature of each opclass support function, we
implement unified way to pass options to opclass support functions. Options
are set to fn_expr as the constant bytea expression. It's possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
/* amproc indexes for the opclass-options and sort-support procedures. */
#define GIST_OPTIONS_PROC				10
#define GIST_SORTSUPPORT_PROC			11
/*
 * Equal to the highest *_PROC number (11); presumably the total number of
 * GiST support procedure slots -- keep in sync when adding a new one.
 */
#define GISTNProcs					11
|
2001-05-30 21:53:40 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Page opaque data in a GiST index page.
|
|
|
|
*/
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There were two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with it in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
/* Bit values for the "flags" field of the GiST page opaque data. */
#define F_LEAF				(1 << 0)	/* leaf page */
#define F_DELETED			(1 << 1)	/* the page has been deleted */
#define F_TUPLES_DELETED	(1 << 2)	/* some tuples on the page were
										 * deleted */
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There were two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with it in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
/* Further bit values for the "flags" field of the GiST page opaque data. */
#define F_FOLLOW_RIGHT		(1 << 3)	/* page to the right has no downlink */
#define F_HAS_GARBAGE		(1 << 4)	/* some tuples on the page are dead,
										 * but not deleted yet */
|
1996-08-27 23:50:29 +02:00
|
|
|
|
2021-03-10 23:03:10 +01:00
|
|
|
/*
|
|
|
|
* NSN (node sequence number) is a special-purpose LSN which is stored on each
|
|
|
|
* index page in GISTPageOpaqueData and updated only during page splits. By
|
|
|
|
* recording the parent's LSN in GISTSearchItem.parentlsn, it is possible to
|
|
|
|
* detect concurrent child page splits by checking if parentlsn < child's NSN,
|
|
|
|
* and handle them properly. The child page's LSN is insufficient for this
|
|
|
|
* purpose since it is updated for every page change.
|
|
|
|
*/
|
2005-06-27 14:45:23 +02:00
|
|
|
typedef XLogRecPtr GistNSN;
|
2013-05-29 22:58:43 +02:00
|
|
|
|
Generate less WAL during GiST, GIN and SP-GiST index build.
Instead of WAL-logging every modification during the build separately,
first build the index without any WAL-logging, and make a separate pass
through the index at the end, to write all pages to the WAL. This
significantly reduces the amount of WAL generated, and is usually also
faster, despite the extra I/O needed for the extra scan through the index.
WAL generated this way is also faster to replay.
For GiST, the LSN-NSN interlock makes this a little tricky. All pages must
be marked with a valid (i.e. non-zero) LSN, so that the parent-child
LSN-NSN interlock works correctly. We now use magic value 1 for that during
index build. Change the fake LSN counter to begin from 1000, so that 1 is
safely smaller than any real or fake LSN. 2 would've been enough for our
purposes, but let's reserve a bigger range, in case we need more special
values in the future.
Author: Anastasia Lubennikova, Andrey V. Lepikhov
Reviewed-by: Heikki Linnakangas, Dmitry Dolgov
2019-04-03 16:03:15 +02:00
|
|
|
/*
|
2021-03-10 23:03:10 +01:00
|
|
|
* A fake LSN / NSN value used during index builds. Must be smaller than any
|
|
|
|
* real or fake (unlogged) LSN generated after the index build completes so
|
|
|
|
* that all splits are considered complete.
|
Generate less WAL during GiST, GIN and SP-GiST index build.
Instead of WAL-logging every modification during the build separately,
first build the index without any WAL-logging, and make a separate pass
through the index at the end, to write all pages to the WAL. This
significantly reduces the amount of WAL generated, and is usually also
faster, despite the extra I/O needed for the extra scan through the index.
WAL generated this way is also faster to replay.
For GiST, the LSN-NSN interlock makes this a little tricky. All pages must
be marked with a valid (i.e. non-zero) LSN, so that the parent-child
LSN-NSN interlock works correctly. We now use magic value 1 for that during
index build. Change the fake LSN counter to begin from 1000, so that 1 is
safely smaller than any real or fake LSN. 2 would've been enough for our
purposes, but let's reserve a bigger range, in case we need more special
values in the future.
Author: Anastasia Lubennikova, Andrey V. Lepikhov
Reviewed-by: Heikki Linnakangas, Dmitry Dolgov
2019-04-03 16:03:15 +02:00
|
|
|
*/
|
|
|
|
/* The constant is 1, the smallest non-zero value, typed as XLogRecPtr. */
#define GistBuildLSN ((XLogRecPtr) 1)
|
|
|
|
|
2013-01-17 15:35:46 +01:00
|
|
|
/*
|
|
|
|
* For on-disk compatibility with pre-9.3 servers, NSN is stored as two
|
|
|
|
* 32-bit fields on disk, same as LSNs.
|
|
|
|
*/
|
|
|
|
typedef PageXLogRecPtr PageGistNSN;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
typedef struct GISTPageOpaqueData
|
|
|
|
{
|
2013-01-17 15:35:46 +01:00
|
|
|
PageGistNSN nsn; /* this value must change on page split */
|
2007-04-10 00:04:08 +02:00
|
|
|
BlockNumber rightlink; /* next page if any */
|
|
|
|
uint16 flags; /* see bit definitions above */
|
|
|
|
uint16 gist_page_id; /* for identification of GiST indexes */
|
1996-08-27 23:50:29 +02:00
|
|
|
} GISTPageOpaqueData;
|
|
|
|
|
|
|
|
typedef GISTPageOpaqueData *GISTPageOpaque;
|
|
|
|
|
Implement operator class parameters
PostgreSQL provides set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There opclasses define representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may be
faced some tradeoffs, which require user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to evade changing signature of each opclass support function, we
implement unified way to pass options to opclass support functions. Options
are set to fn_expr as the constant bytea expression. It's possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
/*
|
|
|
|
* Maximum possible sizes for GiST index tuple and index key. Calculation is
|
|
|
|
* based on assumption that GiST page should fit at least 4 tuples. In theory,
|
|
|
|
* GiST index can be functional when page can fit 3 tuples. But that seems
|
2020-11-14 03:43:10 +01:00
|
|
|
* rather inefficient, so we use a bit conservative estimate.
|
Implement operator class parameters
PostgreSQL provides set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There opclasses define representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may be
faced some tradeoffs, which require user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to evade changing signature of each opclass support function, we
implement unified way to pass options to opclass support functions. Options
are set to fn_expr as the constant bytea expression. It's possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
*
|
|
|
|
* The maximum size of index key is true for unicolumn index. Therefore, this
|
|
|
|
* estimation should be used to figure out which maximum size of GiST index key
|
|
|
|
* makes sense at all. For multicolumn indexes, user might be able to tune
|
|
|
|
* key size using opclass parameters.
|
|
|
|
*/
|
|
|
|
/*
 * Upper bound on a GiST index tuple: one quarter of the block size left
 * after SizeOfPageHeaderData and sizeof(GISTPageOpaqueData), minus
 * sizeof(ItemIdData), rounded down with MAXALIGN_DOWN.  The expansion is
 * wrapped in parentheses so it is safe inside larger expressions.
 */
#define GISTMaxIndexTupleSize	\
	(MAXALIGN_DOWN((BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)) / \
				   4 - sizeof(ItemIdData)))

/*
 * Upper bound on a GiST index key: the tuple bound less
 * MAXALIGN(sizeof(IndexTupleData)).
 */
#define GISTMaxIndexKeySize \
	(GISTMaxIndexTupleSize - MAXALIGN(sizeof(IndexTupleData)))
|
|
|
|
|
2007-04-10 00:04:08 +02:00
|
|
|
/*
 * The page ID is for the convenience of pg_filedump and similar utilities,
 * which otherwise would have a hard time telling pages of different index
 * types apart.  It should be the last 2 bytes on the page.  This is more or
 * less "free" due to alignment considerations.
 */
#define GIST_PAGE_ID		0xFF81
|
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
/*
|
2001-05-30 21:53:40 +02:00
|
|
|
* This is the Split Vector to be returned by the PickSplit method.
|
2013-02-10 17:58:15 +01:00
|
|
|
* PickSplit should fill the indexes of tuples to go to the left side into
|
|
|
|
* spl_left[], and those to go to the right into spl_right[] (note the method
|
|
|
|
* is responsible for palloc'ing both of these arrays!). The tuple counts
|
|
|
|
* go into spl_nleft/spl_nright, and spl_ldatum/spl_rdatum must be set to
|
|
|
|
* the union keys for each side.
|
|
|
|
*
|
|
|
|
* If spl_ldatum_exists and spl_rdatum_exists are true, then we are performing
|
|
|
|
* a "secondary split" using a non-first index column. In this case some
|
|
|
|
* decisions have already been made about a page split, and the set of tuples
|
|
|
|
* being passed to PickSplit is just the tuples about which we are undecided.
|
|
|
|
* spl_ldatum/spl_rdatum then contain the union keys for the tuples already
|
|
|
|
* chosen to go left or right. Ideally the PickSplit method should take those
|
|
|
|
* keys into account while deciding what to do with the remaining tuples, ie
|
|
|
|
* it should try to "build out" from those unions so as to minimally expand
|
|
|
|
* them. If it does so, it should union the given tuples' keys into the
|
|
|
|
* existing spl_ldatum/spl_rdatum values rather than just setting those values
|
|
|
|
* from scratch, and then set spl_ldatum_exists/spl_rdatum_exists to false to
|
|
|
|
* show it has done this.
|
|
|
|
*
|
|
|
|
* If the PickSplit method fails to clear spl_ldatum_exists/spl_rdatum_exists,
|
|
|
|
* the core GiST code will make its own decision about how to merge the
|
|
|
|
* secondary-split results with the previously-chosen tuples, and will then
|
|
|
|
* recompute the union keys from scratch. This is a workable though often not
|
|
|
|
* optimal approach.
|
2001-05-30 21:53:40 +02:00
|
|
|
*/
|
1996-08-27 23:50:29 +02:00
|
|
|
typedef struct GIST_SPLITVEC
|
|
|
|
{
|
|
|
|
OffsetNumber *spl_left; /* array of entries that go left */
|
|
|
|
int spl_nleft; /* size of this array */
|
2001-05-31 20:16:55 +02:00
|
|
|
Datum spl_ldatum; /* Union of keys in spl_left */
|
2006-06-28 14:00:14 +02:00
|
|
|
bool spl_ldatum_exists; /* true, if spl_ldatum already exists. */
|
2001-05-31 20:16:55 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
OffsetNumber *spl_right; /* array of entries that go right */
|
|
|
|
int spl_nright; /* size of the array */
|
2001-05-31 20:16:55 +02:00
|
|
|
Datum spl_rdatum; /* Union of keys in spl_right */
|
2006-06-28 14:00:14 +02:00
|
|
|
bool spl_rdatum_exists; /* true, if spl_rdatum already exists. */
|
1996-08-27 23:50:29 +02:00
|
|
|
} GIST_SPLITVEC;
|
|
|
|
|
|
|
|
/*
|
2005-05-17 05:34:18 +02:00
|
|
|
* An entry on a GiST node. Contains the key, as well as its own
|
|
|
|
* location (rel,page,offset) which can supply the matching pointer.
|
2006-06-28 14:00:14 +02:00
|
|
|
* leafkey is a flag to tell us if the entry is in a leaf node.
|
2001-05-30 21:53:40 +02:00
|
|
|
*/
|
1996-08-27 23:50:29 +02:00
|
|
|
typedef struct GISTENTRY
|
|
|
|
{
|
2001-05-31 20:16:55 +02:00
|
|
|
Datum key;
|
1996-08-27 23:50:29 +02:00
|
|
|
Relation rel;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber offset;
|
|
|
|
bool leafkey;
|
|
|
|
} GISTENTRY;
|
|
|
|
|
2005-06-27 14:45:23 +02:00
|
|
|
/* Fetch the GiST opaque data through the page's special pointer. */
#define GistPageGetOpaque(page) ( (GISTPageOpaque) PageGetSpecialPointer(page) )

/*
 * Flag accessors.  The test macros yield the masked flag bits (non-zero when
 * the flag is set), not a normalized 0/1 value.  The Mark and Clear macros
 * modify the page's flags field in place.
 */
#define GistPageIsLeaf(page)	( GistPageGetOpaque(page)->flags & F_LEAF)
#define GIST_LEAF(entry) (GistPageIsLeaf((entry)->page))

#define GistPageIsDeleted(page) ( GistPageGetOpaque(page)->flags & F_DELETED)

#define GistTuplesDeleted(page) ( GistPageGetOpaque(page)->flags & F_TUPLES_DELETED)
#define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |= F_TUPLES_DELETED)
#define GistClearTuplesDeleted(page)	( GistPageGetOpaque(page)->flags &= ~F_TUPLES_DELETED)

#define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags & F_HAS_GARBAGE)
#define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
#define GistClearPageHasGarbage(page)	( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
|
|
|
|
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There were two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with it in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
/*
 * F_FOLLOW_RIGHT accessors.  The test macro yields the masked flag bit
 * (non-zero when set); Mark and Clear modify the page's flags in place.
 */
#define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
#define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
#define GistClearFollowRight(page)	( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)

/* Read and write the NSN kept in the page's GiST opaque data. */
#define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
|
|
|
|
|
2019-07-24 19:24:07 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* On a deleted page, we store this struct. A deleted page doesn't contain any
|
|
|
|
* tuples, so we don't use the normal page layout with line pointers. Instead,
|
|
|
|
* this struct is stored right after the standard page header. pd_lower points
|
|
|
|
* to the end of this struct. If we add fields to this struct in the future, we
|
|
|
|
* can distinguish the old and new formats by pd_lower.
|
|
|
|
*/
|
|
|
|
typedef struct GISTDeletedPageContents
|
|
|
|
{
|
|
|
|
/* last xid which could see the page in a scan */
|
|
|
|
FullTransactionId deleteXid;
|
|
|
|
} GISTDeletedPageContents;
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
GistPageSetDeleted(Page page, FullTransactionId deletexid)
|
|
|
|
{
|
|
|
|
Assert(PageIsEmpty(page));
|
|
|
|
|
|
|
|
GistPageGetOpaque(page)->flags |= F_DELETED;
|
|
|
|
((PageHeader) page)->pd_lower = MAXALIGN(SizeOfPageHeaderData) + sizeof(GISTDeletedPageContents);
|
|
|
|
|
|
|
|
((GISTDeletedPageContents *) PageGetContents(page))->deleteXid = deletexid;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline FullTransactionId
|
|
|
|
GistPageGetDeleteXid(Page page)
|
|
|
|
{
|
|
|
|
Assert(GistPageIsDeleted(page));
|
|
|
|
|
|
|
|
/* Is the deleteXid field present? */
|
|
|
|
if (((PageHeader) page)->pd_lower >= MAXALIGN(SizeOfPageHeaderData) +
|
|
|
|
offsetof(GISTDeletedPageContents, deleteXid) + sizeof(FullTransactionId))
|
|
|
|
{
|
|
|
|
return ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
|
|
|
|
}
|
2019-03-22 12:21:20 +01:00
|
|
|
|
2004-03-30 17:45:33 +02:00
|
|
|
/*
|
2010-12-04 02:52:18 +01:00
|
|
|
* Vector of GISTENTRY structs; user-defined methods union and picksplit
|
|
|
|
* take it as one of their arguments
|
2004-03-30 17:45:33 +02:00
|
|
|
*/
|
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
int32 n; /* number of elements */
|
2011-06-16 21:39:09 +02:00
|
|
|
GISTENTRY vector[FLEXIBLE_ARRAY_MEMBER];
|
2004-03-30 17:45:33 +02:00
|
|
|
} GistEntryVector;
|
|
|
|
|
2006-06-25 03:02:12 +02:00
|
|
|
#define GEVHDRSZ (offsetof(GistEntryVector, vector))
|
2004-03-30 17:45:33 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
/*
 * macro to initialize a GISTENTRY
 *
 * e is the GISTENTRY itself (not a pointer to it).  k, r, pg, o and l are
 * stored into its key, rel, page, offset and leafkey fields respectively.
 * Note that e is evaluated once per field, so it must not have side effects.
 */
#define gistentryinit(e, k, r, pg, o, l) \
	do { (e).key = (k); (e).rel = (r); (e).page = (pg); \
		 (e).offset = (o); (e).leafkey = (l); } while (0)
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
#endif /* GIST_H */
|