/*-------------------------------------------------------------------------
*
* gist_private.h
* private declarations for GiST -- declarations related to the
* internal implementation of GiST, not the public API
*
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/gist_private.h
*
*-------------------------------------------------------------------------
*/
#ifndef GIST_PRIVATE_H
#define GIST_PRIVATE_H
#include "access/gist.h"
#include "access/itup.h"
#include "fmgr.h"
#include "storage/bufmgr.h"
#include "storage/buffile.h"
#include "utils/rbtree.h"
#include "utils/hsearch.h"
/* Buffer lock modes */
#define GIST_SHARE BUFFER_LOCK_SHARE
#define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
#define GIST_UNLOCK BUFFER_LOCK_UNLOCK
typedef struct
{
BlockNumber prev;
uint32 freespace;
char tupledata[1];
} GISTNodeBufferPage;
#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
/* Returns free space in node buffer page */
#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
/* Checks if node buffer page is empty */
#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
/* Checks if node buffer page doesn't have sufficient space for an index tuple */
#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
MAXALIGN(IndexTupleSize(itup)))
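/*
 * For illustration only: a minimal sketch of how the macros above combine
 * when appending a tuple to a node buffer page, modeled on the logic in
 * gistbuildbuffers.c.  The helper name is hypothetical.  Tuples are packed
 * from the start of the data area; freespace counts the unused bytes at the
 * end of the page.
 */
static inline void
example_place_itup(GISTNodeBufferPage *nbp, IndexTuple itup)
{
	Size		itupsz = IndexTupleSize(itup);

	Assert(!PAGE_NO_SPACE(nbp, itup));

	/* Reserve a MAXALIGNed slot, then copy the tuple into it */
	PAGE_FREE_SPACE(nbp) -= MAXALIGN(itupsz);
	memcpy((char *) nbp + BLCKSZ - PAGE_FREE_SPACE(nbp) - MAXALIGN(itupsz),
		   itup, itupsz);
}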
/*
* GISTSTATE: information needed for any GiST index operation
*
* This struct retains call info for the index's opclass-specific support
* functions (per index column), plus the index's tuple descriptor.
*
* scanCxt holds the GISTSTATE itself as well as any data that lives for the
* lifetime of the index operation. We pass this to the support functions
* via fn_mcxt, so that they can store scan-lifespan data in it. The
* functions are invoked in tempCxt, which is typically short-lifespan
* (that is, it's reset after each tuple). However, tempCxt can be the same
* as scanCxt if we're not bothering with per-tuple context resets.
*/
typedef struct GISTSTATE
{
MemoryContext scanCxt; /* context for scan-lifespan data */
MemoryContext tempCxt; /* short-term context for calling functions */
TupleDesc tupdesc; /* index's tuple descriptor */
FmgrInfo consistentFn[INDEX_MAX_KEYS];
FmgrInfo unionFn[INDEX_MAX_KEYS];
FmgrInfo compressFn[INDEX_MAX_KEYS];
FmgrInfo decompressFn[INDEX_MAX_KEYS];
FmgrInfo penaltyFn[INDEX_MAX_KEYS];
FmgrInfo picksplitFn[INDEX_MAX_KEYS];
FmgrInfo equalFn[INDEX_MAX_KEYS];
FmgrInfo distanceFn[INDEX_MAX_KEYS];
/* Collations to pass to the support functions */
Oid supportCollation[INDEX_MAX_KEYS];
} GISTSTATE;
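/*
 * For illustration only: a minimal sketch of invoking a support function
 * through a GISTSTATE, modeled on gistpenalty() in gistutil.c.  The helper
 * name is hypothetical, and the real function also handles NULL entries and
 * strict support functions.  Callers normally run inside tempCxt, while
 * fn_mcxt points at scanCxt for scan-lifespan allocations.
 */
static inline float
example_call_penalty(GISTSTATE *giststate, int attno,
					 GISTENTRY *orig, GISTENTRY *add)
{
	float		penalty = 0.0;

	/* The penalty support function writes its result through arg 3 */
	FunctionCall3Coll(&giststate->penaltyFn[attno],
					  giststate->supportCollation[attno],
					  PointerGetDatum(orig),
					  PointerGetDatum(add),
					  PointerGetDatum(&penalty));
	return penalty;
}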
/*
* During a GiST index search, we must maintain a queue of unvisited items,
* which can be either individual heap tuples or whole index pages. If it
* is an ordered search, the unvisited items should be visited in distance
* order. Unvisited items at the same distance should be visited in
* depth-first order, that is, heap items first, then lower index pages, then
* upper index pages; this rule avoids doing extra work during a search that
* ends early due to LIMIT.
*
* To perform an ordered search, we use an RBTree to manage the distance-order
* queue. Each GISTSearchTreeItem stores all unvisited items of the same
* distance; they are GISTSearchItems chained together via their next fields.
*
* In a non-ordered search (no order-by operators), the RBTree degenerates
* to a single item, which we use as a queue of unvisited index pages only.
* In this case matched heap items from the current index leaf page are
* remembered in GISTScanOpaqueData.pageData[] and returned directly from
* there, instead of building a separate GISTSearchItem for each one.
*/
/* Individual heap tuple to be visited */
typedef struct GISTSearchHeapItem
{
ItemPointerData heapPtr;
bool recheck; /* T if quals must be rechecked */
} GISTSearchHeapItem;
/* Unvisited item, either index page or heap tuple */
typedef struct GISTSearchItem
{
struct GISTSearchItem *next; /* list link */
BlockNumber blkno; /* index page number, or InvalidBlockNumber */
union
{
GistNSN parentlsn; /* parent page's LSN, if index page */
/* we must store parentlsn to detect whether a split occurred */
GISTSearchHeapItem heap; /* heap info, if heap tuple */
} data;
} GISTSearchItem;
#define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber)
/*
* Within a GISTSearchTreeItem's chain, heap items always appear before
* index-page items, since we want to visit heap items first. lastHeap points
* to the last heap item in the chain, or is NULL if there are none.
*/
typedef struct GISTSearchTreeItem
{
RBNode rbnode; /* this is an RBTree item */
GISTSearchItem *head; /* first chain member */
GISTSearchItem *lastHeap; /* last heap-tuple member, if any */
double distances[1]; /* array with numberOfOrderBys entries */
} GISTSearchTreeItem;
#define GSTIHDRSZ offsetof(GISTSearchTreeItem, distances)
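/*
 * For illustration only: GSTIHDRSZ is the size of the fixed header, so a
 * tree item with room for the scan's ORDER BY distances is allocated as in
 * this sketch (cf. gistrescan() in gistscan.c); the helper is hypothetical.
 */
static inline GISTSearchTreeItem *
example_alloc_tree_item(int numberOfOrderBys)
{
	return (GISTSearchTreeItem *)
		palloc(GSTIHDRSZ + sizeof(double) * numberOfOrderBys);
}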
/*
* GISTScanOpaqueData: private state for a scan of a GiST index
*/
typedef struct GISTScanOpaqueData
{
GISTSTATE *giststate; /* index information, see above */
RBTree *queue; /* queue of unvisited items */
MemoryContext queueCxt; /* context holding the queue */
bool qual_ok; /* false if qual can never be satisfied */
bool firstCall; /* true until first gistgettuple call */
GISTSearchTreeItem *curTreeItem; /* current queue item, if any */
/* pre-allocated workspace arrays */
GISTSearchTreeItem *tmpTreeItem; /* workspace to pass to rb_insert */
double *distances; /* output area for gistindex_keytest */
/* In a non-ordered search, returnable heap items are stored here: */
GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
OffsetNumber nPageData; /* number of valid items in array */
OffsetNumber curPageData; /* next item to return */
} GISTScanOpaqueData;
typedef GISTScanOpaqueData *GISTScanOpaque;
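/*
 * For illustration only: a sketch of the non-ordered fast path, where
 * gistgettuple() hands back matches from pageData[] without consulting the
 * queue (cf. gistget.c).  The helper name and signature are hypothetical.
 */
static inline bool
example_next_page_item(GISTScanOpaque so, ItemPointer tid, bool *recheck)
{
	if (so->curPageData < so->nPageData)
	{
		*tid = so->pageData[so->curPageData].heapPtr;
		*recheck = so->pageData[so->curPageData].recheck;
		so->curPageData++;
		return true;
	}
	return false;				/* caller must pull the next page from the queue */
}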
/* XLog stuff */
#define XLOG_GIST_PAGE_UPDATE 0x00
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
#define XLOG_GIST_CREATE_INDEX 0x50
/* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */
typedef struct gistxlogPageUpdate
{
RelFileNode node;
BlockNumber blkno;
/*
* If this operation completes a page split, by inserting a downlink for
* the split page, leftchild points to the left half of the split.
*/
BlockNumber leftchild;
/* number of deleted offsets */
uint16 ntodelete;
/*
* follow: 1. todelete OffsetNumbers 2. tuples to insert
*/
} gistxlogPageUpdate;
typedef struct gistxlogPageSplit
{
RelFileNode node;
BlockNumber origblkno; /* the split page */
BlockNumber origrlink; /* rightlink of the page before split */
GistNSN orignsn; /* NSN of the page before split */
bool origleaf; /* was the split page a leaf page? */
BlockNumber leftchild; /* like in gistxlogPageUpdate */
uint16 npage; /* # of pages in the split */
bool markfollowright; /* set F_FOLLOW_RIGHT flags */
/*
* follow: 1. gistxlogPage and array of IndexTupleData per page
*/
} gistxlogPageSplit;
typedef struct gistxlogPage
{
BlockNumber blkno;
int num; /* number of index tuples following */
} gistxlogPage;
/* SplitedPageLayout - gistSplit function result */
typedef struct SplitedPageLayout
{
gistxlogPage block;
IndexTupleData *list;
int lenlist;
IndexTuple itup; /* union key for page */
Page page; /* page to operate on */
Buffer buffer; /* to write out once the whole split is processed */
struct SplitedPageLayout *next;
} SplitedPageLayout;
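/*
 * For illustration only: gistSplit() hands back one SplitedPageLayout per
 * page of a (possibly multi-way) split, chained through next, so callers
 * walk the list as in this sketch (cf. gistplacetopage()):
 */
static inline int
example_count_split_pages(SplitedPageLayout *dist)
{
	SplitedPageLayout *ptr;
	int			npage = 0;

	for (ptr = dist; ptr != NULL; ptr = ptr->next)
		npage++;
	return npage;
}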
/*
* GISTInsertStack is used for locking buffers and transferring arguments
* during insertion
*/
typedef struct GISTInsertStack
{
/* current page */
BlockNumber blkno;
Buffer buffer;
Page page;
/*
* log sequence number from page->lsn, used to recognize concurrent page
* updates and, compared with the page's NSN, to recognize page splits
*/
GistNSN lsn;
/* offset of the downlink in the parent page that points to this page */
OffsetNumber downlinkoffnum;
/* pointer to parent */
struct GISTInsertStack *parent;
} GISTInsertStack;
typedef struct GistSplitVector
{
GIST_SPLITVEC splitVector; /* to/from PickSplit method */
Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in
* spl_left */
bool spl_lisnull[INDEX_MAX_KEYS];
Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in
* spl_right */
bool spl_risnull[INDEX_MAX_KEYS];
bool *spl_equiv; /* equivalent tuples which can be freely
* distributed between left and right pages */
} GistSplitVector;
typedef struct
{
Relation r;
Size freespace; /* free space to be left */
GISTInsertStack *stack;
} GISTInsertState;
/* root page of a gist index */
#define GIST_ROOT_BLKNO 0
/*
* Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on inner
* pages to finish crash recovery of incomplete page splits. If a crash
* happened in the middle of a page split, so that the downlink pointers were
* not yet inserted, crash recovery inserted a special downlink pointer. The
* semantics of an invalid tuple were that if you encounter one in a scan,
* it must always be followed, because we don't know if the tuples on the
* child page match or not.
*
* We no longer create such invalid tuples; instead, we mark the left half of
* such an incomplete split with the F_FOLLOW_RIGHT flag, and finish the
* split properly the next time we need to insert on that page. To retain
* on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
* the offset number of all inner tuples. If we encounter any invalid tuples
* with 0xfffe during insertion, we throw an error, though scans still handle
* them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
* gist index which already has invalid tuples in it because of a crash. That
* should be rare, and you are recommended to REINDEX anyway if you have any
* invalid tuples in an index, so throwing an error is as far as we go with
* supporting that.
*/
#define TUPLE_IS_VALID 0xffff
#define TUPLE_IS_INVALID 0xfffe
#define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
#define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
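/*
 * For illustration only: a sketch of the insertion-time check described
 * above; the real error in gist.c also names the index and hints at
 * REINDEX.  The helper name is hypothetical.
 */
static inline void
example_reject_invalid(IndexTuple idxtuple)
{
	if (GistTupleIsInvalid(idxtuple))
		elog(ERROR, "index contains an invalid tuple, REINDEX it");
}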
/*
* A buffer attached to an internal node, used when building an index in
* buffering mode.
*/
typedef struct
{
BlockNumber nodeBlocknum; /* index block # this buffer is for */
int32 blocksCount; /* current # of blocks occupied by buffer */
BlockNumber pageBlocknum; /* temporary file block # */
GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
/* is this buffer queued for emptying? */
bool queuedForEmptying;
/* is this a temporary copy, not in the hash table? */
bool isTemp;
int level; /* 0 == leaf */
} GISTNodeBuffer;
/*
* Does specified level have buffers? (Beware of multiple evaluation of
* arguments.)
*/
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
(nlevel) != (gfbb)->rootlevel)
/* Is specified buffer at least half-filled (should be queued for emptying)? */
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
/*
* Is specified buffer full? Our buffers can actually grow indefinitely,
* beyond the "maximum" size, so this just means whether the buffer has grown
* beyond the nominal maximum size.
*/
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
/*
* Data structure with general information about build buffers.
*/
typedef struct GISTBuildBuffers
{
/* Persistent memory context for the buffers and metadata. */
MemoryContext context;
BufFile *pfile; /* Temporary file to store buffers in */
long nFileBlocks; /* Current size of the temporary file */
/* Resizable array of free blocks */
long *freeBlocks;
int nFreeBlocks; /* # of currently free blocks in the array */
int freeBlocksLen; /* current allocated length of the array */
/* Hash for buffers by block number */
HTAB *nodeBuffersTab;
/* List of buffers scheduled for emptying */
List *bufferEmptyingQueue;
/*
* Parameters to the buffering build algorithm. levelStep determines which
* levels in the tree have buffers, and pagesPerBuffer determines how
* large each buffer is.
*/
int levelStep;
int pagesPerBuffer;
/* Array of lists of buffers on each level, for final emptying */
List **buffersOnLevels;
int buffersOnLevelsLen;
/*
* Dynamically-sized array of buffers that currently have their last page
* loaded in main memory.
*/
GISTNodeBuffer **loadedBuffers;
int loadedBuffersCount; /* # of entries in loadedBuffers */
int loadedBuffersLen; /* allocated size of loadedBuffers */
/* Level of the current root node (= height of the index tree - 1) */
int rootlevel;
} GISTBuildBuffers;
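/*
 * For illustration only: after pushing a tuple, the buffering build queues
 * a node buffer for emptying once it is at least half-filled, roughly as in
 * this sketch (cf. gistbuildbuffers.c).  The helper name is hypothetical;
 * queuedForEmptying guards against queueing the same buffer twice, and the
 * queue is kept in the long-lived build context.
 */
static inline void
example_maybe_queue_for_emptying(GISTBuildBuffers *gfbb,
								 GISTNodeBuffer *nodeBuffer)
{
	if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
	{
		MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);

		gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
										  gfbb->bufferEmptyingQueue);
		nodeBuffer->queuedForEmptying = true;
		MemoryContextSwitchTo(oldcxt);
	}
}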
/*
* Storage type for GiST's reloptions
*/
typedef struct GiSTOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
int bufferingModeOffset; /* use buffering build? */
} GiSTOptions;
/* gist.c */
extern Datum gistbuildempty(PG_FUNCTION_ARGS);
extern Datum gistinsert(PG_FUNCTION_ARGS);
extern MemoryContext createTempGistContext(void);
extern GISTSTATE *initGISTstate(Relation index);
extern void freeGISTstate(GISTSTATE *giststate);
extern void gistdoinsert(Relation r,
IndexTuple itup,
Size freespace,
GISTSTATE *GISTstate);
/* A List of these is returned from gistplacetopage() in *splitinfo */
typedef struct
{
Buffer buf; /* the split page "half" */
IndexTuple downlink; /* downlink for this half. */
} GISTPageSplitInfo;
extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
Buffer buffer,
IndexTuple *itup, int ntup,
OffsetNumber oldoffnum, BlockNumber *newblkno,
Buffer leftchildbuf,
List **splitinfo,
bool markleftchild);
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
/* gistxlog.c */
extern void gist_redo(XLogRecPtr lsn, XLogRecord *record);
extern void gist_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void gist_xlog_startup(void);
extern void gist_xlog_cleanup(void);
extern XLogRecPtr gistXLogUpdate(RelFileNode node, Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
Buffer leftchild);
extern XLogRecPtr gistXLogSplit(RelFileNode node,
BlockNumber blkno, bool page_is_leaf,
SplitedPageLayout *dist,
BlockNumber origrlink, GistNSN oldnsn,
Buffer leftchild, bool markfollowright);
/* gistget.c */
extern Datum gistgettuple(PG_FUNCTION_ARGS);
extern Datum gistgetbitmap(PG_FUNCTION_ARGS);
/* gistutil.c */
#define GiSTPageSize \
( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
#define GIST_MIN_FILLFACTOR 10
#define GIST_DEFAULT_FILLFACTOR 90
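/*
 * For illustration only: the fillfactor translates into a per-page free
 * space target, as in this sketch (cf. RelationGetTargetPageFreeSpace());
 * the default of 90 leaves roughly 10% of each page free for future
 * insertions.  The helper name is hypothetical.
 */
static inline Size
example_target_freespace(int fillfactor)
{
	if (fillfactor < GIST_MIN_FILLFACTOR)
		fillfactor = GIST_MIN_FILLFACTOR;	/* clamp to the legal minimum */
	return BLCKSZ * (100 - fillfactor) / 100;
}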
extern Datum gistoptions(PG_FUNCTION_ARGS);
extern bool gistfitpage(IndexTuple *itvec, int len);
extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace);
extern void gistcheckpage(Relation rel, Buffer buf);
extern Buffer gistNewBuffer(Relation r);
extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
OffsetNumber off);
extern IndexTuple *gistextractpage(Page page, int *len /* out */ );
extern IndexTuple *gistjoinvector(
IndexTuple *itvec, int *len,
IndexTuple *additvec, int addlen);
extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
int len, GISTSTATE *giststate);
extern IndexTuple gistgetadjusted(Relation r,
IndexTuple oldtup,
IndexTuple addtup,
GISTSTATE *giststate);
extern IndexTuple gistFormTuple(GISTSTATE *giststate,
Relation r, Datum *attdata, bool *isnull, bool newValues);
extern OffsetNumber gistchoose(Relation r, Page p,
IndexTuple it,
GISTSTATE *giststate);
extern void gistcentryinit(GISTSTATE *giststate, int nkey,
GISTENTRY *e, Datum k,
Relation r, Page pg,
OffsetNumber o, bool l, bool isNull);
extern void GISTInitBuffer(Buffer b, uint32 f);
extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
Datum k, Relation r, Page pg, OffsetNumber o,
bool l, bool isNull);
extern float gistpenalty(GISTSTATE *giststate, int attno,
GISTENTRY *key1, bool isNull1,
GISTENTRY *key2, bool isNull2);
extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
Datum *attr, bool *isnull);
extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
OffsetNumber o, GISTENTRY *attdata, bool *isnull);
extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
GISTENTRY *entry1, bool isnull1,
GISTENTRY *entry2, bool isnull2,
Datum *dst, bool *dstisnull);
extern XLogRecPtr GetXLogRecPtrForTemp(void);
/* gistvacuum.c */
extern Datum gistbulkdelete(PG_FUNCTION_ARGS);
extern Datum gistvacuumcleanup(PG_FUNCTION_ARGS);
/* gistsplit.c */
extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate,
GistSplitVector *v, GistEntryVector *entryvec,
int attno);
/* gistbuild.c */
extern Datum gistbuild(PG_FUNCTION_ARGS);
extern void gistValidateBufferingOption(char *value);
/* gistbuildbuffers.c */
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
int maxLevel);
extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
GISTSTATE *giststate,
BlockNumber blkno, int level);
extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
GISTNodeBuffer *nodeBuffer, IndexTuple item);
extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
GISTNodeBuffer *nodeBuffer, IndexTuple *item);
extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
GISTSTATE *giststate, Relation r,
int level, Buffer buffer,
List *splitinfo);
extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
#endif /* GIST_PRIVATE_H */