1996-08-27 23:50:29 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
1999-02-14 00:22:53 +01:00
|
|
|
* nbtree.h
|
1997-09-07 07:04:48 +02:00
|
|
|
* header file for postgres btree access method implementation.
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
|
|
|
*
|
2002-06-20 22:29:54 +02:00
|
|
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
2003-02-21 01:06:22 +01:00
|
|
|
* $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
#ifndef NBTREE_H
|
|
|
|
#define NBTREE_H
|
1996-08-27 23:50:29 +02:00
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "access/itup.h"
|
1999-07-16 19:07:40 +02:00
|
|
|
#include "access/relscan.h"
|
|
|
|
#include "access/sdir.h"
|
2000-11-21 22:16:06 +01:00
|
|
|
#include "access/xlogutils.h"
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* BTPageOpaqueData -- At the end of every page, we store pointers
|
2001-02-22 22:48:49 +01:00
|
|
|
* to both siblings in the tree. This is used to do forward/backward
|
2003-02-21 01:06:22 +01:00
|
|
|
* index scans. The next-page link is also critical for recovery when
|
|
|
|
* a search has navigated to the wrong page due to concurrent page splits
|
|
|
|
* or deletions; see src/backend/access/nbtree/README for more info.
|
1996-08-27 23:50:29 +02:00
|
|
|
*
|
2003-02-21 01:06:22 +01:00
|
|
|
* In addition, we store the page's btree level (counting upwards from
|
|
|
|
* zero at a leaf page) as well as some flag bits indicating the page type
|
|
|
|
* and status. If the page is deleted, we replace the level with the
|
|
|
|
* next-transaction-ID value indicating when it is safe to reclaim the page.
|
|
|
|
*
|
|
|
|
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
|
|
|
|
* instead.
|
1996-08-27 23:50:29 +02:00
|
|
|
*/
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
typedef struct BTPageOpaqueData
|
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
|
|
|
|
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
|
|
|
|
union
|
|
|
|
{
|
|
|
|
uint32 level; /* tree level --- zero for leaf pages */
|
|
|
|
TransactionId xact; /* next transaction ID, if deleted */
|
|
|
|
} btpo;
|
|
|
|
uint16 btpo_flags; /* flag bits, see below */
|
2001-02-21 20:07:04 +01:00
|
|
|
} BTPageOpaqueData;
|
|
|
|
|
|
|
|
typedef BTPageOpaqueData *BTPageOpaque;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Bits defined in btpo_flags */
|
2003-02-21 01:06:22 +01:00
|
|
|
#define BTP_LEAF 0x0001 /* bit 0: leaf page, i.e. not an internal page */
|
2001-10-28 07:26:15 +01:00
|
|
|
#define BTP_ROOT 0x0002 /* bit 1: root page (has no parent) */
|
2003-02-21 01:06:22 +01:00
|
|
|
#define BTP_DELETED 0x0004 /* bit 2: page has been deleted from tree */
|
2001-10-28 07:26:15 +01:00
|
|
|
#define BTP_META 0x0008 /* bit 3: meta-page */
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
|
2001-02-22 22:48:49 +01:00
|
|
|
/*
|
|
|
|
* The Meta page is always the first page in the btree index.
|
|
|
|
* Its primary purpose is to point to the location of the btree root page.
|
2003-02-21 01:06:22 +01:00
|
|
|
* We also point to the "fast" root, which is the current effective root;
|
|
|
|
* see README for discussion.
|
2001-02-22 22:48:49 +01:00
|
|
|
*/
|
2000-10-13 04:03:02 +02:00
|
|
|
|
|
|
|
typedef struct BTMetaPageData
|
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
uint32 btm_magic; /* should contain BTREE_MAGIC */
|
|
|
|
uint32 btm_version; /* should contain BTREE_VERSION */
|
|
|
|
BlockNumber btm_root; /* current root location */
|
|
|
|
uint32 btm_level; /* tree level of the root page */
|
|
|
|
BlockNumber btm_fastroot; /* current "fast" root location */
|
|
|
|
uint32 btm_fastlevel; /* tree level of the "fast" root page */
|
2000-10-13 04:03:02 +02:00
|
|
|
} BTMetaPageData;
|
|
|
|
|
|
|
|
#define BTPageGetMeta(p) \
|
2002-07-02 07:48:44 +02:00
|
|
|
((BTMetaPageData *) PageGetContents(p))
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
#define BTREE_METAPAGE 0 /* first page is meta */
|
2001-10-28 07:26:15 +01:00
|
|
|
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
|
2003-02-21 01:06:22 +01:00
|
|
|
#define BTREE_VERSION 2 /* current version number */
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2002-07-02 07:48:44 +02:00
|
|
|
/*
|
|
|
|
* We actually need to be able to fit three items on every page,
|
|
|
|
* so restrict any one item to 1/3 the per-page available space.
|
|
|
|
*/
|
|
|
|
#define BTMaxItemSize(page) \
|
|
|
|
((PageGetPageSize(page) - \
|
|
|
|
sizeof(PageHeaderData) - \
|
|
|
|
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* BTItems are what we store in the btree. Each item is an index tuple,
|
|
|
|
* including key and pointer values. (In some cases either the key or the
|
|
|
|
* pointer may go unused, see backend/access/nbtree/README for details.)
|
|
|
|
*
|
|
|
|
* Old comments:
|
|
|
|
* In addition, we must guarantee that all tuples in the index are unique,
|
|
|
|
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
|
|
|
* do this is by generating a new OID for every insertion that we do in the
|
|
|
|
* tree. This adds eight bytes to the size of btree index tuples. Note
|
|
|
|
* that we do not use the OID as part of a composite key; the OID only
|
|
|
|
* serves as a unique identifier for a given index tuple (logical position
|
|
|
|
* within a page).
|
1997-04-16 03:21:59 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* New comments:
|
|
|
|
* actually, we must guarantee that all tuples in A LEVEL
|
|
|
|
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
|
|
|
|
* as unique identifier for a given index tuple (logical position
|
2000-07-21 08:42:39 +02:00
|
|
|
* within a level). - vadim 04/09/97
|
1996-08-27 23:50:29 +02:00
|
|
|
*/
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
typedef struct BTItemData
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
IndexTupleData bti_itup;
|
1997-09-08 22:59:27 +02:00
|
|
|
} BTItemData;
|
1996-08-27 23:50:29 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
typedef BTItemData *BTItem;
|
1996-08-27 23:50:29 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* For XLOG: size without alignment. Sizeof works as long as
|
2000-10-04 02:04:43 +02:00
|
|
|
* IndexTupleData has exactly 8 bytes.
|
|
|
|
*/
|
|
|
|
#define SizeOfBTItem sizeof(BTItemData)
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Test whether items are the "same" per the above notes */
|
|
|
|
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
|
|
|
|
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
|
|
|
|
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
|
|
|
|
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
|
|
|
|
(i1)->bti_itup.t_tid.ip_posid == \
|
|
|
|
(i2)->bti_itup.t_tid.ip_posid )
|
1997-04-16 03:21:59 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* In general, the btree code tries to localize its knowledge about
|
|
|
|
* page layout to a couple of routines. However, we need a special
|
|
|
|
* value to indicate "no page number" in those places where we expect
|
2000-07-21 08:42:39 +02:00
|
|
|
* page numbers. We can use zero for this because we never need to
|
|
|
|
* make a pointer to the metadata page.
|
1996-08-27 23:50:29 +02:00
|
|
|
*/
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
#define P_NONE 0
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Macros to test whether a page is leftmost or rightmost on its tree level,
|
|
|
|
* as well as other state info kept in the opaque data.
|
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
/* True when the page has no left sibling, i.e. btpo_prev holds P_NONE. */
#define P_LEFTMOST(opaque) (P_NONE == (opaque)->btpo_prev)
|
|
|
|
/* True when the page has no right sibling, i.e. btpo_next holds P_NONE. */
#define P_RIGHTMOST(opaque) (P_NONE == (opaque)->btpo_next)
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Nonzero when the BTP_LEAF bit is set in the page's flag word. */
#define P_ISLEAF(opaque) (((opaque)->btpo_flags & BTP_LEAF) != 0)
|
|
|
|
/*
 * Tests the BTP_ROOT bit in the page's flag word.  The result is normalized
 * to exactly 0 or 1 (rather than the raw mask value) so it can safely be
 * stored in a narrow boolean or compared against true/false.
 */
#define P_ISROOT(opaque) (((opaque)->btpo_flags & BTP_ROOT) != 0)
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
 * Tests the BTP_DELETED bit in the page's flag word.  The result is
 * normalized to exactly 0 or 1 (rather than the raw mask value) so it can
 * safely be stored in a narrow boolean or compared against true/false.
 */
#define P_ISDELETED(opaque) (((opaque)->btpo_flags & BTP_DELETED) != 0)
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
|
|
|
|
* page. The high key is not a data key, but gives info about what range of
|
|
|
|
* keys is supposed to be on this page. The high key on a page is required
|
|
|
|
* to be greater than or equal to any data key that appears on the page.
|
|
|
|
* If we find ourselves trying to insert a key > high key, we know we need
|
|
|
|
* to move right (this should only happen if the page was split since we
|
|
|
|
* examined the parent page).
|
|
|
|
*
|
|
|
|
* Our insertion algorithm guarantees that we can use the initial least key
|
|
|
|
* on our right sibling as the high key. Once a page is created, its high
|
|
|
|
* key changes only if the page is split.
|
|
|
|
*
|
|
|
|
* On a non-rightmost page, the high key lives in item 1 and data items
|
|
|
|
* start in item 2. Rightmost pages have no high key, so we store data
|
|
|
|
* items beginning in item 1.
|
|
|
|
*/
|
1996-08-27 23:50:29 +02:00
|
|
|
|
2001-02-23 00:02:33 +01:00
|
|
|
#define P_HIKEY ((OffsetNumber) 1)
|
|
|
|
#define P_FIRSTKEY ((OffsetNumber) 2)
|
2001-03-22 05:01:46 +01:00
|
|
|
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
1996-08-27 23:50:29 +02:00
|
|
|
|
2000-09-12 08:07:52 +02:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* XLOG records for btree operations
|
|
|
|
*
|
2000-09-12 08:07:52 +02:00
|
|
|
* XLOG allows us to store some information in the high 4 bits of the log
|
|
|
|
* record xl_info field
|
|
|
|
*/
|
2003-02-21 01:06:22 +01:00
|
|
|
#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */
|
|
|
|
#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
|
|
|
|
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
|
|
|
|
#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */
|
|
|
|
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
|
|
|
|
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */
|
|
|
|
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
|
|
|
|
#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */
|
|
|
|
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
|
|
|
|
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
|
|
|
|
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
|
|
|
|
#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */
|
|
|
|
#define XLOG_BTREE_NEWPAGE 0xC0 /* new index page during build */
|
2000-12-28 14:00:29 +01:00
|
|
|
|
2000-09-12 08:07:52 +02:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* All that we need to find the changed index tuple
|
2000-09-12 08:07:52 +02:00
|
|
|
*/
|
|
|
|
/* Locator for one index tuple in an XLOG record: a relation file plus a tid. */
typedef struct xl_btreetid
|
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
RelFileNode node; /* NOTE(review): presumably the index's relation file -- confirm */
|
|
|
|
ItemPointerData tid; /* changed tuple id */
|
2000-09-12 08:07:52 +02:00
|
|
|
} xl_btreetid;
|
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* All that we need to regenerate the meta-data page
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
 * Logged copy of the metapage's root pointers.  The four fields parallel
 * btm_root, btm_level, btm_fastroot and btm_fastlevel of BTMetaPageData.
 */
typedef struct xl_btree_metadata
|
2000-09-12 08:07:52 +02:00
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
BlockNumber root; /* root location (cf. btm_root) */
|
|
|
|
uint32 level; /* tree level of the root page (cf. btm_level) */
|
|
|
|
BlockNumber fastroot; /* "fast" root location (cf. btm_fastroot) */
|
|
|
|
uint32 fastlevel; /* tree level of the "fast" root (cf. btm_fastlevel) */
|
|
|
|
} xl_btree_metadata;
|
2000-09-12 08:07:52 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* This is what we need to know about a simple (non-split) insert.
|
|
|
|
*
|
|
|
|
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
|
|
|
|
* Note that INSERT_META implies it's not a leaf page.
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
2000-09-12 08:07:52 +02:00
|
|
|
typedef struct xl_btree_insert
|
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
xl_btreetid target; /* inserted tuple id */
|
2003-02-21 01:06:22 +01:00
|
|
|
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
|
2000-12-28 14:00:29 +01:00
|
|
|
/* BTITEM FOLLOWS AT END OF STRUCT */
|
2000-09-12 08:07:52 +02:00
|
|
|
} xl_btree_insert;
|
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
2000-09-12 08:07:52 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
/*
|
2000-12-28 14:00:29 +01:00
|
|
|
* On insert with split we save items of both left and right siblings
|
2003-02-21 01:06:22 +01:00
|
|
|
* and restore the content of both pages from the log record. This takes less
|
|
|
|
* xlog space than the normal approach, because if we did it the standard way,
|
|
|
|
* XLogInsert would almost always think the right page is new and store its
|
|
|
|
* whole page image.
|
|
|
|
*
|
|
|
|
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
|
|
|
|
* The _L and _R variants indicate whether the inserted btitem went into the
|
|
|
|
* left or right split page (and thus, whether otherblk is the right or left
|
|
|
|
* page of the split pair). The _ROOT variants indicate that we are splitting
|
|
|
|
* the root page, and thus that a newroot record rather than an insert or
|
|
|
|
* split record should follow. Note that a split record never carries a
|
|
|
|
* metapage update --- we'll do that in the parent-level update.
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
2000-09-12 08:07:52 +02:00
|
|
|
/* Fixed-size header of a split record; the page tuples follow it. */
typedef struct xl_btree_split
|
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
xl_btreetid target; /* inserted tuple id */
|
2003-02-21 01:06:22 +01:00
|
|
|
BlockNumber otherblk; /* second block participating in split: */
|
2001-03-22 05:01:46 +01:00
|
|
|
/* first one is stored in target's tid */
|
2003-02-21 01:06:22 +01:00
|
|
|
BlockNumber leftblk; /* prev/left block */
|
|
|
|
BlockNumber rightblk; /* next/right block */
|
|
|
|
uint32 level; /* tree level of page being split */
|
2001-03-22 05:01:46 +01:00
|
|
|
uint16 leftlen; /* length of left page items below */
|
2003-02-21 01:06:22 +01:00
|
|
|
/* LEFT AND RIGHT PAGES' TUPLES FOLLOW AT THE END */
|
2000-09-12 08:07:52 +02:00
|
|
|
} xl_btree_split;
|
|
|
|
|
2000-12-28 14:00:29 +01:00
|
|
|
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
/*
|
2003-02-21 01:06:22 +01:00
|
|
|
* This is what we need to know about deletion of an individual leaf btitem
|
|
|
|
*/
|
|
|
|
typedef struct xl_btree_delete
|
|
|
|
{
|
|
|
|
xl_btreetid target; /* deleted tuple id */
|
|
|
|
} xl_btree_delete;
|
|
|
|
|
|
|
|
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is what we need to know about deletion of a btree page. The target
|
|
|
|
* identifies the tuple removed from the parent page (note that we remove
|
|
|
|
* this tuple's downlink and the *following* tuple's key). Note we do not
|
|
|
|
* store any content for the deleted page --- it is just rewritten as empty
|
|
|
|
* during recovery.
|
|
|
|
*/
|
|
|
|
typedef struct xl_btree_delete_page
|
|
|
|
{
|
|
|
|
xl_btreetid target; /* deleted tuple id in parent page */
|
|
|
|
BlockNumber deadblk; /* child block being deleted */
|
|
|
|
BlockNumber leftblk; /* child block's left sibling, if any */
|
|
|
|
BlockNumber rightblk; /* child block's right sibling */
|
|
|
|
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
|
|
|
|
} xl_btree_delete_page;
|
|
|
|
|
|
|
|
#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* New root log record. There are zero btitems if this is to establish an
|
|
|
|
* empty root, or two if it is the result of splitting an old root.
|
|
|
|
*
|
|
|
|
* Note that although this implies rewriting the metadata page, we don't need
|
|
|
|
* an xl_btree_metadata record --- the rootblk and level are sufficient.
|
2000-10-04 02:04:43 +02:00
|
|
|
*/
|
|
|
|
typedef struct xl_btree_newroot
|
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
RelFileNode node;
|
2003-02-21 01:06:22 +01:00
|
|
|
BlockNumber rootblk; /* location of new root */
|
|
|
|
uint32 level; /* its tree level */
|
2000-10-04 02:04:43 +02:00
|
|
|
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
|
|
|
|
} xl_btree_newroot;
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* New metapage log record. This is not issued during routine operations;
|
|
|
|
* it's only used when initializing an empty index and at completion of
|
|
|
|
* index build.
|
|
|
|
*/
|
|
|
|
/* Payload of a new-metapage record: a relation file plus the metadata values. */
typedef struct xl_btree_newmeta
|
|
|
|
{
|
|
|
|
RelFileNode node; /* NOTE(review): presumably the index's relation file -- confirm */
|
|
|
|
xl_btree_metadata meta; /* root/level/fastroot/fastlevel values for the metapage */
|
|
|
|
} xl_btree_newmeta;
|
|
|
|
|
|
|
|
#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* New index page log record. This is only used while building a new index.
|
|
|
|
*/
|
|
|
|
typedef struct xl_btree_newpage
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
|
|
|
BlockNumber blkno; /* location of new page */
|
|
|
|
/* entire page contents follow at end of record */
|
|
|
|
} xl_btree_newpage;
|
|
|
|
|
|
|
|
#define SizeOfBtreeNewpage (offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
|
|
|
|
|
2000-09-12 08:07:52 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
|
1996-08-27 23:50:29 +02:00
|
|
|
*/
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
#define BTLessStrategyNumber 1
|
|
|
|
#define BTLessEqualStrategyNumber 2
|
|
|
|
#define BTEqualStrategyNumber 3
|
1996-08-27 23:50:29 +02:00
|
|
|
#define BTGreaterEqualStrategyNumber 4
|
1997-09-07 07:04:48 +02:00
|
|
|
#define BTGreaterStrategyNumber 5
|
|
|
|
#define BTMaxStrategyNumber 5
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* When a new operator class is declared, we require that the user
|
|
|
|
* supply us with an amproc procedure for determining whether, for
|
|
|
|
* two keys a and b, a < b, a = b, or a > b. This routine must
|
|
|
|
* return < 0, 0, > 0, respectively, in these three cases. Since we
|
|
|
|
* only have one such proc in amproc, it's number 1.
|
1996-08-27 23:50:29 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
#define BTORDER_PROC 1
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
|
|
|
* We need to be able to tell the difference between read and write
|
|
|
|
* requests for pages, in order to do locking correctly.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define BT_READ BUFFER_LOCK_SHARE
|
|
|
|
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BTStackData -- As we descend a tree, we push the (location, downlink)
|
|
|
|
* pairs from internal pages onto a private stack. If we split a
|
|
|
|
* leaf, we use this stack to walk back up the tree and insert data
|
|
|
|
* into parent pages (and possibly to split them, too). Lehman and
|
|
|
|
* Yao's update algorithm guarantees that under no circumstances can
|
|
|
|
* our private stack give us an irredeemably bad picture up the tree.
|
|
|
|
* Again, see the paper for details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* One entry of the private descent stack; entries are chained via bts_parent. */
typedef struct BTStackData
|
|
|
|
{
|
|
|
|
BlockNumber bts_blkno; /* presumably the "location": block of the internal page visited -- confirm */
|
|
|
|
OffsetNumber bts_offset; /* presumably the offset of the downlink item on that page -- confirm */
|
|
|
|
BTItemData bts_btitem; /* presumably a copy of the downlink item followed -- confirm */
|
|
|
|
struct BTStackData *bts_parent; /* next stack entry, one step further up the tree */
|
|
|
|
} BTStackData;
|
|
|
|
|
|
|
|
typedef BTStackData *BTStack;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BTScanOpaqueData is used to remember which buffers we're currently
|
|
|
|
* examining in the scan. We keep these buffers pinned (but not locked,
|
|
|
|
* see nbtree.c) and recorded in the opaque entry of the scan to avoid
|
|
|
|
* doing a ReadBuffer() for every tuple in the index.
|
|
|
|
*
|
|
|
|
* And it's used to remember actual scankey info (we need it
|
|
|
|
* if some scankeys are evaluated at runtime).
|
|
|
|
*
|
|
|
|
* curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
|
|
|
|
* index tuples: we don't adjust scans on insertions (and, if LLL
|
|
|
|
* is ON, don't hold locks on index pages between passes) - we
|
|
|
|
* use these pointers to restore index scan positions...
|
|
|
|
* - vadim 07/29/98
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Per-scan btree state: buffers, heap pointers, and preprocessed scan keys. */
typedef struct BTScanOpaqueData
|
|
|
|
{
|
|
|
|
Buffer btso_curbuf; /* presumably the buffer holding the current scan position -- confirm */
|
|
|
|
Buffer btso_mrkbuf; /* presumably the buffer holding the marked scan position -- confirm */
|
|
|
|
ItemPointerData curHeapIptr; /* heap iptr from the current index tuple */
|
|
|
|
ItemPointerData mrkHeapIptr; /* heap iptr from the marked index tuple */
|
|
|
|
/* these fields are set by _bt_orderkeys(), which see for more info: */
|
|
|
|
bool qual_ok; /* false if qual can never be satisfied */
|
|
|
|
int numberOfKeys; /* number of scan keys */
|
|
|
|
int numberOfRequiredKeys; /* number of keys that must be
|
|
|
|
* matched to continue the scan */
|
|
|
|
ScanKey keyData; /* array of scan keys */
|
|
|
|
} BTScanOpaqueData;
|
|
|
|
|
|
|
|
typedef BTScanOpaqueData *BTScanOpaque;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtree.c (external entry points for btree)
|
|
|
|
*/
|
|
|
|
extern bool BuildingBtree; /* in nbtree.c */
|
|
|
|
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
extern void AtEOXact_nbtree(void);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
extern Datum btbuild(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btinsert(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btgettuple(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btbeginscan(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btrescan(PG_FUNCTION_ARGS);
|
|
|
|
extern void btmovescan(IndexScanDesc scan, Datum v);
|
|
|
|
extern Datum btendscan(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btmarkpos(PG_FUNCTION_ARGS);
|
|
|
|
extern Datum btrestrpos(PG_FUNCTION_ARGS);
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
|
2000-07-21 08:42:39 +02:00
|
|
|
|
1996-08-27 23:50:29 +02:00
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtinsert.c
|
|
|
|
*/
|
1998-09-01 06:40:42 +02:00
|
|
|
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
|
2000-04-12 19:17:23 +02:00
|
|
|
bool index_is_unique, Relation heapRel);
|
2003-02-21 01:06:22 +01:00
|
|
|
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
|
|
|
|
BTStack stack, bool is_root, bool is_only);
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtpage.c
|
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
extern void _bt_metapinit(Relation rel);
|
|
|
|
extern Buffer _bt_getroot(Relation rel, int access);
|
2003-02-21 01:06:22 +01:00
|
|
|
extern Buffer _bt_gettrueroot(Relation rel);
|
1997-09-08 04:41:22 +02:00
|
|
|
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
extern void _bt_relbuf(Relation rel, Buffer buf);
|
1997-09-08 04:41:22 +02:00
|
|
|
extern void _bt_wrtbuf(Relation rel, Buffer buf);
|
|
|
|
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
|
|
|
|
extern void _bt_pageinit(Page page, Size size);
|
2003-02-21 01:06:22 +01:00
|
|
|
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtsearch.c
|
|
|
|
*/
|
1998-09-01 06:40:42 +02:00
|
|
|
extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey,
|
2001-03-22 05:01:46 +01:00
|
|
|
Buffer *bufP, int access);
|
1998-09-01 06:40:42 +02:00
|
|
|
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
|
1997-09-07 07:04:48 +02:00
|
|
|
ScanKey scankey, int access);
|
1998-09-01 06:40:42 +02:00
|
|
|
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
|
2001-03-22 05:01:46 +01:00
|
|
|
ScanKey scankey);
|
2000-07-21 08:42:39 +02:00
|
|
|
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
|
2001-03-22 05:01:46 +01:00
|
|
|
Page page, OffsetNumber offnum);
|
2002-05-21 01:51:44 +02:00
|
|
|
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
|
|
|
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
1997-09-08 22:59:27 +02:00
|
|
|
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
|
2003-02-21 01:06:22 +01:00
|
|
|
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtstrat.c
|
|
|
|
*/
|
1998-09-01 06:40:42 +02:00
|
|
|
extern StrategyNumber _bt_getstrat(Relation rel, AttrNumber attno,
|
2000-04-12 19:17:23 +02:00
|
|
|
RegProcedure proc);
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtutils.c
|
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
|
2000-02-18 07:32:39 +01:00
|
|
|
extern ScanKey _bt_mkscankey_nodata(Relation rel);
|
1997-09-08 04:41:22 +02:00
|
|
|
extern void _bt_freeskey(ScanKey skey);
|
|
|
|
extern void _bt_freestack(BTStack stack);
|
2002-05-24 20:57:57 +02:00
|
|
|
extern void _bt_orderkeys(IndexScanDesc scan);
|
2000-07-25 06:47:59 +02:00
|
|
|
extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
|
2001-03-22 05:01:46 +01:00
|
|
|
ScanDirection dir, bool *continuescan);
|
1997-09-08 04:41:22 +02:00
|
|
|
extern BTItem _bt_formitem(IndexTuple itup);
|
1996-08-27 23:50:29 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtsort.c
|
|
|
|
*/
|
1999-10-18 00:15:09 +02:00
|
|
|
|
2000-04-12 19:17:23 +02:00
|
|
|
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
|
1999-10-18 00:15:09 +02:00
|
|
|
|
|
|
|
extern BTSpool *_bt_spoolinit(Relation index, bool isunique);
|
|
|
|
extern void _bt_spooldestroy(BTSpool *btspool);
|
|
|
|
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
|
2000-08-10 04:33:20 +02:00
|
|
|
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
|
2001-10-28 07:26:15 +01:00
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
|
|
|
* prototypes for functions in nbtxlog.c
|
|
|
|
*/
|
|
|
|
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
|
|
|
|
extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
|
|
|
|
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
|
|
|
|
extern void btree_xlog_startup(void);
|
|
|
|
extern void btree_xlog_cleanup(void);
|
|
|
|
|
2001-11-05 18:46:40 +01:00
|
|
|
#endif /* NBTREE_H */
|