/*-------------------------------------------------------------------------
 *
 * gist_private.h
 *	  private declarations for GiST -- declarations related to the
 *	  internal implementation of GiST, not the public API
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/gist_private.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef GIST_PRIVATE_H
#define GIST_PRIVATE_H

#include "access/amapi.h"
#include "access/gist.h"
#include "access/itup.h"
#include "fmgr.h"
#include "lib/pairingheap.h"
#include "storage/bufmgr.h"
#include "storage/buffile.h"
#include "utils/hsearch.h"
#include "access/genam.h"

/*
 * Maximum number of "halves" a page can be split into in one operation.
 * Typically a split produces 2 halves, but can be more if keys have very
 * different lengths, or when inserting multiple keys in one operation (as
 * when inserting downlinks to an internal node). There is no theoretical
 * limit on this, but in practice if you get more than a handful of page
 * halves in one split, there's something wrong with the opclass
 * implementation. GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used
 * to size some local arrays used during split. Note that there is also a
 * limit on the number of buffers that can be held locked at a time,
 * MAX_SIMUL_LWLOCKS, so if you raise this higher than that limit, you'll
 * just get a different error.
 */
#define GIST_MAX_SPLIT_PAGES		75

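/*
 * Illustrative sketch (not part of the original header): GIST_MAX_SPLIT_PAGES
 * is used to bound local, per-half work arrays in the split code, along
 * these lines.  The helper name is hypothetical.
 */
#ifdef NOT_USED
static inline void
gistSplitWorkspaceSketch(void)
{
	/* one entry per page "half" that a single split can produce */
	Buffer		halfbufs[GIST_MAX_SPLIT_PAGES];

	(void) halfbufs;			/* keep the compiler quiet in this sketch */
}
#endif
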
/* Buffer lock modes */
#define GIST_SHARE		BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE	BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK		BUFFER_LOCK_UNLOCK

typedef struct
{
	BlockNumber prev;			/* previous page of this node buffer, in the
								 * temporary file */
	uint32		freespace;		/* bytes of unused space on this page */
	char		tupledata[FLEXIBLE_ARRAY_MEMBER];	/* index tuples */
} GISTNodeBufferPage;

#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
/* Returns free space in node buffer page */
#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
/* Checks if node buffer page is empty */
#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
/* Checks if node buffer page doesn't have enough space for an index tuple */
#define PAGE_NO_SPACE(nbp, itup)	(PAGE_FREE_SPACE(nbp) < \
									 MAXALIGN(IndexTupleSize(itup)))

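/*
 * Illustrative sketch (not part of the original header): how the macros
 * above fit together when deciding whether an index tuple still fits on a
 * node buffer page.  The helper name is hypothetical.
 */
#ifdef NOT_USED
static inline bool
gistPageHasRoomSketch(GISTNodeBufferPage *nbp, IndexTuple itup)
{
	/* a fresh page has everything after the MAXALIGN'd header free */
	if (PAGE_IS_EMPTY(nbp))
		Assert(PAGE_FREE_SPACE(nbp) == BLCKSZ - BUFFER_PAGE_DATA_OFFSET);

	/* PAGE_NO_SPACE() compares free space against the MAXALIGN'd size */
	return !PAGE_NO_SPACE(nbp, itup);
}
#endif
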
/*
 * GISTSTATE: information needed for any GiST index operation
 *
 * This struct retains call info for the index's opclass-specific support
 * functions (per index column), plus the index's tuple descriptor.
 *
 * scanCxt holds the GISTSTATE itself as well as any data that lives for the
 * lifetime of the index operation.  We pass this to the support functions
 * via fn_mcxt, so that they can store scan-lifespan data in it.  The
 * functions are invoked in tempCxt, which is typically short-lifespan
 * (that is, it's reset after each tuple).  However, tempCxt can be the same
 * as scanCxt if we're not bothering with per-tuple context resets.
 */
typedef struct GISTSTATE
{
	MemoryContext scanCxt;		/* context for scan-lifespan data */
	MemoryContext tempCxt;		/* short-term context for calling functions */

	TupleDesc	leafTupdesc;	/* index's tuple descriptor */
	TupleDesc	nonLeafTupdesc; /* truncated tuple descriptor for non-leaf
								 * pages */
	TupleDesc	fetchTupdesc;	/* tuple descriptor for tuples returned in an
								 * index-only scan */

	FmgrInfo	consistentFn[INDEX_MAX_KEYS];
	FmgrInfo	unionFn[INDEX_MAX_KEYS];
	FmgrInfo	compressFn[INDEX_MAX_KEYS];
	FmgrInfo	decompressFn[INDEX_MAX_KEYS];
	FmgrInfo	penaltyFn[INDEX_MAX_KEYS];
	FmgrInfo	picksplitFn[INDEX_MAX_KEYS];
	FmgrInfo	equalFn[INDEX_MAX_KEYS];
	FmgrInfo	distanceFn[INDEX_MAX_KEYS];
	FmgrInfo	fetchFn[INDEX_MAX_KEYS];

	/* Collations to pass to the support functions */
	Oid			supportCollation[INDEX_MAX_KEYS];
} GISTSTATE;

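/*
 * Illustrative sketch (not part of the original header): the typical way a
 * per-column support function above gets invoked -- in tempCxt, through
 * fmgr, with the matching support collation.  The helper name is
 * hypothetical; compare gistpenalty() in gistutil.c for the real thing.
 */
#ifdef NOT_USED
static inline float
gistCallPenaltySketch(GISTSTATE *giststate, int attno,
					  GISTENTRY *orig, GISTENTRY *add)
{
	MemoryContext oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
	float		penalty = 0.0;

	/* the penalty function fills in *penalty through its third argument */
	FunctionCall3Coll(&giststate->penaltyFn[attno],
					  giststate->supportCollation[attno],
					  PointerGetDatum(orig),
					  PointerGetDatum(add),
					  PointerGetDatum(&penalty));
	MemoryContextSwitchTo(oldCxt);
	return penalty;
}
#endif
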
/*
 * During a GiST index search, we must maintain a queue of unvisited items,
 * which can be either individual heap tuples or whole index pages.  If it
 * is an ordered search, the unvisited items should be visited in distance
 * order.  Unvisited items at the same distance should be visited in
 * depth-first order, that is heap items first, then lower index pages, then
 * upper index pages; this rule avoids doing extra work during a search that
 * ends early due to LIMIT.
 *
 * To perform an ordered search, we use a pairing heap to manage the
 * distance-order queue.  In a non-ordered search (no order-by operators),
 * we use it to return heap tuples before unvisited index pages, to
 * ensure depth-first order, but all entries are otherwise considered
 * equal.
 */

/* Individual heap tuple to be visited */
typedef struct GISTSearchHeapItem
{
	ItemPointerData heapPtr;
	bool		recheck;		/* T if quals must be rechecked */
	bool		recheckDistances;	/* T if distances must be rechecked */
	HeapTuple	recontup;		/* data reconstructed from the index, used in
								 * index-only scans */
	OffsetNumber offnum;		/* track offset in page to mark tuple as
								 * LP_DEAD */
} GISTSearchHeapItem;

/* Unvisited item, either index page or heap tuple */
typedef struct GISTSearchItem
{
	pairingheap_node phNode;
	BlockNumber blkno;			/* index page number, or InvalidBlockNumber */
	union
	{
		GistNSN		parentlsn;	/* parent page's LSN, if index page */
		/* we must store parentlsn to detect whether a split occurred */
		GISTSearchHeapItem heap;	/* heap info, if heap tuple */
	}			data;
	double		distances[FLEXIBLE_ARRAY_MEMBER];	/* numberOfOrderBys
													 * entries */
} GISTSearchItem;

#define GISTSearchItemIsHeap(item)	((item).blkno == InvalidBlockNumber)

#define SizeOfGISTSearchItem(n_distances) (offsetof(GISTSearchItem, distances) + sizeof(double) * (n_distances))

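/*
 * Illustrative sketch (not part of the original header): allocating a
 * variable-size GISTSearchItem for an index page and queueing it on the
 * scan's pairing heap.  The helper is hypothetical; the real code lives in
 * gistget.c.
 */
#ifdef NOT_USED
static inline void
gistQueuePageItemSketch(pairingheap *queue, BlockNumber blkno,
						GistNSN parentlsn, int nOrderBys)
{
	GISTSearchItem *item = (GISTSearchItem *)
		palloc(SizeOfGISTSearchItem(nOrderBys));

	item->blkno = blkno;		/* an index page, so not InvalidBlockNumber */
	item->data.parentlsn = parentlsn;	/* for detecting concurrent splits */
	/* an ordered search would also fill item->distances[] here */
	pairingheap_add(queue, &item->phNode);
}
#endif
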
/*
 * GISTScanOpaqueData: private state for a scan of a GiST index
 */
typedef struct GISTScanOpaqueData
{
	GISTSTATE  *giststate;		/* index information, see above */
	Oid		   *orderByTypes;	/* datatypes of ORDER BY expressions */

	pairingheap *queue;			/* queue of unvisited items */
	MemoryContext queueCxt;		/* context holding the queue */
	bool		qual_ok;		/* false if qual can never be satisfied */
	bool		firstCall;		/* true until first gistgettuple call */

	/* pre-allocated workspace arrays */
	double	   *distances;		/* output area for gistindex_keytest */

	/* info about killed items if any (killedItems is NULL if never used) */
	OffsetNumber *killedItems;	/* offset numbers of killed items */
	int			numKilled;		/* number of currently stored items */
	BlockNumber curBlkno;		/* current block number */
	GistNSN		curPageLSN;		/* pos in the WAL stream when page was read */

	/* In a non-ordered search, returnable heap items are stored here: */
	GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
	OffsetNumber nPageData;		/* number of valid items in array */
	OffsetNumber curPageData;	/* next item to return */
	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
								 * index-only scans */
} GISTScanOpaqueData;

typedef GISTScanOpaqueData *GISTScanOpaque;

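/*
 * Illustrative sketch (not part of the original header): how a non-ordered
 * scan consumes the pageData[] array filled from the current leaf page.
 * The helper is hypothetical; the real loop is in gistget.c.
 */
#ifdef NOT_USED
static inline GISTSearchHeapItem *
gistNextPageItemSketch(GISTScanOpaque so)
{
	/* NULL once every item gathered from the current page is consumed */
	if (so->curPageData >= so->nPageData)
		return NULL;
	return &so->pageData[so->curPageData++];
}
#endif
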
/* despite the name, gistxlogPage is not part of any xlog record */
typedef struct gistxlogPage
{
	BlockNumber blkno;
	int			num;			/* number of index tuples following */
} gistxlogPage;

/* SplitedPageLayout - gistSplit function result */
typedef struct SplitedPageLayout
{
	gistxlogPage block;
	IndexTupleData *list;
	int			lenlist;
	IndexTuple	itup;			/* union key for page */
	Page		page;			/* page to operate on */
	Buffer		buffer;			/* buffer to write out when all is done */
	struct SplitedPageLayout *next;
} SplitedPageLayout;

/*
 * GISTInsertStack is used for locking buffers and transferring arguments
 * during insertion.
 */
typedef struct GISTInsertStack
{
	/* current page */
	BlockNumber blkno;
	Buffer		buffer;
	Page		page;

	/*
	 * log sequence number from page->lsn to recognize page update and compare
	 * it with page's nsn to recognize page split
	 */
	GistNSN		lsn;

	/*
	 * If set, we split the page while descending the tree to find an
	 * insertion target.  It means that we need to retry from the parent,
	 * because the downlink of this page might no longer cover the new key.
	 */
	bool		retry_from_parent;

	/* offset of the downlink in the parent page, that points to this page */
	OffsetNumber downlinkoffnum;

	/* pointer to parent */
	struct GISTInsertStack *parent;
} GISTInsertStack;

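/*
 * Illustrative sketch (not part of the original header): the parent links
 * let insertion walk back up the descent path, e.g. when retry_from_parent
 * forces a retry.  The helper name is hypothetical.
 */
#ifdef NOT_USED
static inline GISTInsertStack *
gistStackTopSketch(GISTInsertStack *stack)
{
	/* the topmost entry of the chain corresponds to the root page */
	while (stack->parent != NULL)
		stack = stack->parent;
	return stack;
}
#endif
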
/* Working state and results for multi-column split logic in gistsplit.c */
typedef struct GistSplitVector
{
	GIST_SPLITVEC splitVector;	/* passed to/from user PickSplit method */

	Datum		spl_lattr[INDEX_MAX_KEYS];	/* union of subkeys in
											 * splitVector.spl_left */
	bool		spl_lisnull[INDEX_MAX_KEYS];

	Datum		spl_rattr[INDEX_MAX_KEYS];	/* union of subkeys in
											 * splitVector.spl_right */
	bool		spl_risnull[INDEX_MAX_KEYS];

	bool	   *spl_dontcare;	/* flags tuples which could go to either side
								 * of the split for zero penalty */
} GistSplitVector;

typedef struct
{
	Relation	r;				/* index relation */
	Relation	heapRel;		/* heap relation being indexed */
	Size		freespace;		/* free space to be left */
	bool		is_build;		/* true during index build, when individual
								 * changes are not WAL-logged */

	GISTInsertStack *stack;
} GISTInsertState;

/* root page of a gist index */
#define GIST_ROOT_BLKNO		0

/*
 * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on
 * inner pages to finish crash recovery of incomplete page splits. If a crash
 * happened in the middle of a page split, so that the downlink pointers were
 * not yet inserted, crash recovery inserted a special downlink pointer. The
 * semantics of an invalid tuple were that if you encounter one in a scan,
 * it must always be followed, because we don't know if the tuples on the
 * child page match or not.
 *
 * We no longer create such invalid tuples; we now mark the left half of such
 * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the
 * split properly the next time we need to insert on that page. To retain
 * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
 * the offset number of all inner tuples. If we encounter any invalid tuples
 * with 0xfffe during insertion, we throw an error, though scans still handle
 * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
 * gist index which already has invalid tuples in it because of a crash. That
 * should be rare, and you are recommended to REINDEX anyway if you have any
 * invalid tuples in an index, so throwing an error is as far as we go with
 * supporting that.
 */
#define TUPLE_IS_VALID		0xffff
#define TUPLE_IS_INVALID	0xfffe

#define GistTupleIsInvalid(itup)	( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
#define GistTupleSetValid(itup)		ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )

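/*
 * Illustrative sketch (not part of the original header): the kind of check
 * insertion performs when it steps onto an inner tuple, per the comment
 * above.  The helper name and message wording are hypothetical; the real
 * check lives in the insertion code.
 */
#ifdef NOT_USED
static inline void
gistRejectInvalidTupleSketch(IndexTuple itup)
{
	if (GistTupleIsInvalid(itup))
		ereport(ERROR,
				(errmsg("index contains an invalid tuple left over from a pre-9.1 crash"),
				 errhint("Please REINDEX it.")));
}
#endif
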
/*
 * A buffer attached to an internal node, used when building an index in
 * buffering mode.
 */
typedef struct
{
	BlockNumber nodeBlocknum;	/* index block # this buffer is for */
	int32		blocksCount;	/* current # of blocks occupied by buffer */

	BlockNumber pageBlocknum;	/* temporary file block # */
	GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */

	/* is this buffer queued for emptying? */
	bool		queuedForEmptying;

	/* is this a temporary copy, not in the hash table? */
	bool		isTemp;

	int			level;			/* 0 == leaf */
} GISTNodeBuffer;

/*
 * Does specified level have buffers? (Beware of multiple evaluation of
 * arguments.)
 */
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
	((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
	 (nlevel) != (gfbb)->rootlevel)

/* Is specified buffer at least half-filled (should be queued for emptying)? */
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
	((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)

/*
 * Is specified buffer full? Our buffers can actually grow indefinitely,
 * beyond the "maximum" size, so this just means whether the buffer has grown
 * beyond the nominal maximum size.
 */
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
	((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)

/*
 * Data structure with general information about build buffers.
 */
typedef struct GISTBuildBuffers
{
	/* Persistent memory context for the buffers and metadata. */
	MemoryContext context;

	BufFile    *pfile;			/* temporary file to store buffers in */
	long		nFileBlocks;	/* current size of the temporary file */

	/* resizable array of free blocks */
	long	   *freeBlocks;
	int			nFreeBlocks;	/* # of currently free blocks in the array */
	int			freeBlocksLen;	/* current allocated length of the array */

	/* hash of buffers, by block number */
	HTAB	   *nodeBuffersTab;

	/* list of buffers scheduled for emptying */
	List	   *bufferEmptyingQueue;

	/*
	 * Parameters to the buffering build algorithm.  levelStep determines
	 * which levels in the tree have buffers, and pagesPerBuffer determines
	 * how large each buffer is.
	 */
	int			levelStep;
	int			pagesPerBuffer;

	/* array of lists of buffers on each level, for final emptying */
	List	  **buffersOnLevels;
	int			buffersOnLevelsLen;

	/*
	 * Dynamically-sized array of buffers that currently have their last page
	 * loaded in main memory.
	 */
	GISTNodeBuffer **loadedBuffers;
	int			loadedBuffersCount; /* # of entries in loadedBuffers */
	int			loadedBuffersLen;	/* allocated size of loadedBuffers */

	/* level of the current root node (= height of the index tree - 1) */
	int			rootlevel;
} GISTBuildBuffers;

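/*
 * Illustrative sketch (not part of the original header): how the buffering
 * build combines the BUFFER_* macros above when deciding whether a node
 * buffer should be emptied.  The helper is hypothetical; the real logic is
 * in gistbuild.c and gistbuildbuffers.c.
 */
#ifdef NOT_USED
static inline bool
gistBufferNeedsEmptyingSketch(GISTNodeBuffer *nodeBuffer,
							  GISTBuildBuffers *gfbb)
{
	/* half-filled buffers get queued; overflowed ones must be emptied */
	return BUFFER_HALF_FILLED(nodeBuffer, gfbb) ||
		BUFFER_OVERFLOWED(nodeBuffer, gfbb);
}
#endif
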
/*
 * Storage type for GiST's reloptions
 */
typedef struct GiSTOptions
{
	int32		vl_len_;		/* varlena header (do not touch directly!) */
	int			fillfactor;		/* page fill factor in percent (0..100) */
	int			bufferingModeOffset;	/* use buffering build? */
} GiSTOptions;

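/*
 * Illustrative sketch (not part of the original header): reading the
 * reloptions above from an open index.  Assumes utils/rel.h is in scope for
 * rd_options; the helper name and the fallback of 90 (GiST's default
 * fillfactor) are assumptions of this sketch.
 */
#ifdef NOT_USED
static inline int
gistFillFactorSketch(Relation index)
{
	GiSTOptions *options = (GiSTOptions *) index->rd_options;

	/* rd_options is NULL unless reloptions were given at CREATE INDEX time */
	return options ? options->fillfactor : 90;
}
#endif
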
/* gist.c */
extern void gistbuildempty(Relation index);
extern bool gistinsert(Relation r, Datum *values, bool *isnull,
					   ItemPointer ht_ctid, Relation heapRel,
					   IndexUniqueCheck checkUnique,
					   struct IndexInfo *indexInfo);
extern MemoryContext createTempGistContext(void);
extern GISTSTATE *initGISTstate(Relation index);
extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
						 IndexTuple itup,
						 Size freespace,
						 GISTSTATE *GISTstate,
						 Relation heapRel,
						 bool is_build);

/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
{
	Buffer		buf;			/* the split page "half" */
	IndexTuple	downlink;		/* downlink for this half */
} GISTPageSplitInfo;

extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
							Buffer buffer,
							IndexTuple *itup, int ntup,
							OffsetNumber oldoffnum, BlockNumber *newblkno,
							Buffer leftchildbuf,
							List **splitinfo,
							bool markleftchild,
							Relation heapRel,
							bool is_build);

extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
									int len, GISTSTATE *giststate);

/* gistxlog.c */
|
|
|
|
extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
|
|
|
|
TransactionId xid, Buffer parentBuffer,
|
|
|
|
OffsetNumber downlinkOffset);
|
|
|
|
|
|
|
|
extern void gistXLogPageReuse(Relation rel, BlockNumber blkno,
|
|
|
|
TransactionId latestRemovedXid);
|
|
|
|
|
2016-06-28 22:01:13 +02:00
|
|
|
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
OffsetNumber *todelete, int ntodelete,
|
2011-04-10 17:42:00 +02:00
|
|
|
IndexTuple *itup, int ntup,
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
2010-12-23 15:03:08 +01:00
|
|
|
Buffer leftchild);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2019-03-22 12:21:20 +01:00
|
|
|
extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
|
2019-04-22 23:28:30 +02:00
|
|
|
int ntodelete, TransactionId latestRemovedXid);
|
2018-12-21 00:37:37 +01:00
|
|
|
|
2016-06-28 22:01:13 +02:00
|
|
|
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
|
2010-12-23 15:03:08 +01:00
|
|
|
SplitedPageLayout *dist,
|
|
|
|
BlockNumber origrlink, GistNSN oldnsn,
|
2011-09-08 16:51:23 +02:00
|
|
|
Buffer leftchild, bool markfollowright);
|
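The markfollowright flag ties into the split protocol described earlier: the
non-rightmost halves of a split keep F_FOLLOW_RIGHT set until their downlinks
reach the parent. Below is a minimal sketch, not the actual insertion code, of
how a descent might react to that flag; GistFollowRight() is the page-flag
test from gist.h, while gistfixsplit(), stack, state, and giststate are
illustrative stand-ins for the inserter's internal routine and local state.

    Page page = BufferGetPage(stack->buffer);

    if (GistFollowRight(page))
    {
        /*
         * This page was split, but the downlink for its right sibling has
         * not been inserted into the parent yet.  Finish that split first,
         * then retry the descent, so the tree stays consistent before we
         * insert anything new.
         */
        gistfixsplit(&state, giststate);
        continue;
    }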
2005-05-17 05:34:18 +02:00
|
|
|
|
|
|
|
/* gistget.c */
|
2016-01-18 01:36:59 +01:00
|
|
|
extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
|
|
|
|
extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
|
|
|
|
extern bool gistcanreturn(Relation index, int attno);
|
|
|
|
|
|
|
|
/* gistvalidate.c */
|
|
|
|
extern bool gistvalidate(Oid opclassoid);
|
2005-05-17 05:34:18 +02:00
|
|
|
|
2005-06-14 13:45:14 +02:00
|
|
|
/* gistutil.c */
|
2006-05-10 11:19:54 +02:00
|
|
|
|
|
|
|
#define GiSTPageSize \
|
2006-10-04 02:30:14 +02:00
|
|
|
( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
|
2006-05-10 11:19:54 +02:00
|
|
|
|
2006-07-11 23:05:57 +02:00
|
|
|
#define GIST_MIN_FILLFACTOR 10
|
2006-07-04 00:45:41 +02:00
|
|
|
#define GIST_DEFAULT_FILLFACTOR 90
|
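For concreteness, with the default 8 kB BLCKSZ these macros work out roughly
as follows; the sketch only illustrates the arithmetic, and freespace is a
hypothetical local name, not a symbol defined in this header.

    /*
     * Assuming BLCKSZ = 8192, SizeOfPageHeaderData = 24, and a MAXALIGN'd
     * GISTPageOpaqueData of 16 bytes, GiSTPageSize is 8192 - 24 - 16 = 8152.
     * The default fillfactor of 90 then leaves about a tenth of that free:
     */
    Size freespace = GiSTPageSize * (100 - GIST_DEFAULT_FILLFACTOR) / 100;  /* ~815 bytes */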
|
|
|
|
2016-01-18 01:36:59 +01:00
|
|
|
extern bytea *gistoptions(Datum reloptions, bool validate);
|
2016-08-14 00:31:14 +02:00
|
|
|
extern bool gistproperty(Oid index_oid, int attno,
|
|
|
|
IndexAMProperty prop, const char *propname,
|
|
|
|
bool *res, bool *isnull);
|
2006-05-10 11:19:54 +02:00
|
|
|
extern bool gistfitpage(IndexTuple *itvec, int len);
|
2006-07-02 04:23:23 +02:00
|
|
|
extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace);
|
2005-11-06 23:39:21 +01:00
|
|
|
extern void gistcheckpage(Relation rel, Buffer buf);
|
2005-10-15 04:49:52 +02:00
|
|
|
extern Buffer gistNewBuffer(Relation r);
|
2019-03-22 12:21:20 +01:00
|
|
|
extern bool gistPageRecyclable(Page page);
|
2008-06-12 11:12:31 +02:00
|
|
|
extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
|
2009-06-11 16:49:15 +02:00
|
|
|
OffsetNumber off);
|
2006-05-17 18:34:59 +02:00
|
|
|
extern IndexTuple *gistextractpage(Page page, int *len /* out */ );
|
2005-10-15 04:49:52 +02:00
|
|
|
extern IndexTuple *gistjoinvector(
|
|
|
|
IndexTuple *itvec, int *len,
|
|
|
|
IndexTuple *additvec, int addlen);
|
2006-10-04 02:30:14 +02:00
|
|
|
extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
|
2006-05-19 18:15:17 +02:00
|
|
|
|
2005-06-14 13:45:14 +02:00
|
|
|
extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
|
2005-10-15 04:49:52 +02:00
|
|
|
int len, GISTSTATE *giststate);
|
2005-06-14 13:45:14 +02:00
|
|
|
extern IndexTuple gistgetadjusted(Relation r,
|
2005-10-15 04:49:52 +02:00
|
|
|
IndexTuple oldtup,
|
|
|
|
IndexTuple addtup,
|
|
|
|
GISTSTATE *giststate);
|
2005-06-14 13:45:14 +02:00
|
|
|
extern IndexTuple gistFormTuple(GISTSTATE *giststate,
|
2015-03-26 18:11:54 +01:00
|
|
|
Relation r, Datum *attdata, bool *isnull, bool isleaf);
|
2005-06-14 13:45:14 +02:00
|
|
|
|
|
|
|
extern OffsetNumber gistchoose(Relation r, Page p,
|
2005-10-15 04:49:52 +02:00
|
|
|
IndexTuple it,
|
|
|
|
GISTSTATE *giststate);
|
2006-05-19 18:15:17 +02:00
|
|
|
|
2005-06-14 13:45:14 +02:00
|
|
|
extern void GISTInitBuffer(Buffer b, uint32 f);
|
|
|
|
extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
|
|
|
|
Datum k, Relation r, Page pg, OffsetNumber o,
|
2006-06-28 14:00:14 +02:00
|
|
|
bool l, bool isNull);
|
|
|
|
|
|
|
|
extern float gistpenalty(GISTSTATE *giststate, int attno,
|
2006-10-04 02:30:14 +02:00
|
|
|
GISTENTRY *key1, bool isNull1,
|
|
|
|
GISTENTRY *key2, bool isNull2);
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
2013-02-07 23:44:02 +01:00
|
|
|
extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
|
2006-10-04 02:30:14 +02:00
|
|
|
Datum *attr, bool *isnull);
|
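Under the reworked API described above, attr/isnull are pure output arrays
covering every index column. A sketch of a typical caller, modeled on what a
gistunion-style routine would do (the local names here are illustrative):

    Datum   attr[INDEX_MAX_KEYS];
    bool    isnull[INDEX_MAX_KEYS];

    /*
     * Compute fresh union keys for ALL columns of the tuples in itvec;
     * nothing previously stored in attr/isnull is folded into the result.
     */
    gistMakeUnionItVec(giststate, itvec, len, attr, isnull);

    /* Form the replacement tuple from the freshly unioned keys. */
    union_tuple = gistFormTuple(giststate, r, attr, isnull, false);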
2006-06-28 14:00:14 +02:00
|
|
|
extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
|
|
|
|
extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
|
2006-10-04 02:30:14 +02:00
|
|
|
OffsetNumber o, GISTENTRY *attdata, bool *isnull);
|
2017-02-27 23:20:34 +01:00
|
|
|
extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r,
|
2015-03-26 18:12:00 +01:00
|
|
|
IndexTuple tuple);
|
2006-10-04 02:30:14 +02:00
|
|
|
extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
|
|
|
|
GISTENTRY *entry1, bool isnull1,
|
|
|
|
GISTENTRY *entry2, bool isnull2,
|
|
|
|
Datum *dst, bool *dstisnull);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2013-02-11 21:50:15 +01:00
|
|
|
extern XLogRecPtr gistGetFakeLSN(Relation rel);
|
2010-11-16 10:02:11 +01:00
|
|
|
|
2005-06-20 12:29:37 +02:00
|
|
|
/* gistvacuum.c */
|
2016-01-18 01:36:59 +01:00
|
|
|
extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info,
|
|
|
|
IndexBulkDeleteResult *stats,
|
|
|
|
IndexBulkDeleteCallback callback,
|
|
|
|
void *callback_state);
|
|
|
|
extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info,
|
|
|
|
IndexBulkDeleteResult *stats);
|
2005-06-14 13:45:14 +02:00
|
|
|
|
2006-06-28 14:00:14 +02:00
|
|
|
/* gistsplit.c */
|
2006-10-04 02:30:14 +02:00
|
|
|
extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
|
|
|
|
int len, GISTSTATE *giststate,
|
2013-02-10 17:58:15 +01:00
|
|
|
GistSplitVector *v,
|
2006-10-04 02:30:14 +02:00
|
|
|
int attno);
|
2006-07-04 00:45:41 +02:00
|
|
|
|
2011-09-08 16:51:23 +02:00
|
|
|
/* gistbuild.c */
|
2016-01-18 01:36:59 +01:00
|
|
|
extern IndexBuildResult *gistbuild(Relation heap, Relation index,
|
|
|
|
struct IndexInfo *indexInfo);
|
2017-10-31 15:34:31 +01:00
|
|
|
extern void gistValidateBufferingOption(const char *value);
|
2011-09-08 16:51:23 +02:00
|
|
|
|
|
|
|
/* gistbuildbuffers.c */
|
|
|
|
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
|
|
|
|
int maxLevel);
|
|
|
|
extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
|
|
|
|
GISTSTATE *giststate,
|
2012-05-30 10:59:14 +02:00
|
|
|
BlockNumber blkno, int level);
|
2011-09-08 16:51:23 +02:00
|
|
|
extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
|
|
|
|
GISTNodeBuffer *nodeBuffer, IndexTuple item);
|
|
|
|
extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
|
|
|
|
GISTNodeBuffer *nodeBuffer, IndexTuple *item);
|
|
|
|
extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
|
|
|
|
extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
|
|
|
|
GISTSTATE *giststate, Relation r,
|
2012-05-30 10:59:14 +02:00
|
|
|
int level, Buffer buffer,
|
2011-09-08 16:51:23 +02:00
|
|
|
List *splitinfo);
|
|
|
|
extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
|
|
|
|
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
|
|
|
#endif /* GIST_PRIVATE_H */
|