postgresql/src/include/access/nbtree.h

494 lines
18 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* nbtree.h
* header file for postgres btree access method implementation.
*
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.89 2005/12/07 19:37:53 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef NBTREE_H
#define NBTREE_H
#include "access/itup.h"
1999-07-16 19:07:40 +02:00
#include "access/relscan.h"
#include "access/sdir.h"
#include "access/xlogutils.h"
/*
* BTPageOpaqueData -- At the end of every page, we store a pointer
* to both siblings in the tree. This is used to do forward/backward
* index scans. The next-page link is also critical for recovery when
* a search has navigated to the wrong page due to concurrent page splits
* or deletions; see src/backend/access/nbtree/README for more info.
*
2003-08-04 02:43:34 +02:00
* In addition, we store the page's btree level (counting upwards from
* zero at a leaf page) as well as some flag bits indicating the page type
* and status. If the page is deleted, we replace the level with the
* next-transaction-ID value indicating when it is safe to reclaim the page.
*
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
* instead.
*/
typedef struct BTPageOpaqueData
{
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
union
{
2003-08-04 02:43:34 +02:00
uint32 level; /* tree level --- zero for leaf pages */
TransactionId xact; /* next transaction ID, if deleted */
2003-08-04 02:43:34 +02:00
} btpo;
uint16 btpo_flags; /* flag bits, see below */
2001-02-21 20:07:04 +01:00
} BTPageOpaqueData;
typedef BTPageOpaqueData *BTPageOpaque;
/* Bits defined in btpo_flags */
#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
#define BTP_META (1 << 3) /* meta-page */
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
/*
* The Meta page is always the first page in the btree index.
* Its primary purpose is to point to the location of the btree root page.
* We also point to the "fast" root, which is the current effective root;
* see README for discussion.
*/
2000-10-13 04:03:02 +02:00
typedef struct BTMetaPageData
{
uint32 btm_magic; /* should contain BTREE_MAGIC */
uint32 btm_version; /* should contain BTREE_VERSION */
BlockNumber btm_root; /* current root location */
uint32 btm_level; /* tree level of the root page */
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
2000-10-13 04:03:02 +02:00
} BTMetaPageData;
#define BTPageGetMeta(p) \
((BTMetaPageData *) PageGetContents(p))
2000-10-04 02:04:43 +02:00
2001-03-22 05:01:46 +01:00
#define BTREE_METAPAGE 0 /* first page is meta */
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
#define BTREE_VERSION 2 /* current version number */
2001-03-22 05:01:46 +01:00
/*
* We actually need to be able to fit three items on every page,
* so restrict any one item to 1/3 the per-page available space.
*/
#define BTMaxItemSize(page) \
((PageGetPageSize(page) - \
sizeof(PageHeaderData) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
/*
* BTItems are what we store in the btree. Each item is an index tuple,
* including key and pointer values. (In some cases either the key or the
* pointer may go unused, see backend/access/nbtree/README for details.)
*
* Old comments:
* In addition, we must guarantee that all tuples in the index are unique,
* in order to satisfy some assumptions in Lehman and Yao. The way that we
* do this is by generating a new OID for every insertion that we do in the
* tree. This adds eight bytes to the size of btree index tuples. Note
* that we do not use the OID as part of a composite key; the OID only
* serves as a unique identifier for a given index tuple (logical position
* within a page).
*
* New comments:
* actually, we must guarantee that all tuples in A LEVEL
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
* as unique identifier for a given index tuple (logical position
* within a level). - vadim 04/09/97
*/
typedef struct BTItemData
{
IndexTupleData bti_itup;
1997-09-08 22:59:27 +02:00
} BTItemData;
typedef BTItemData *BTItem;
#define CopyBTItem(btitem) ((BTItem) CopyIndexTuple((IndexTuple) (btitem)))
2001-03-22 05:01:46 +01:00
/*
* For XLOG: size without alignment. Sizeof works as long as
2000-10-04 02:04:43 +02:00
* IndexTupleData has exactly 8 bytes.
*/
#define SizeOfBTItem sizeof(BTItemData)
/* Test whether items are the "same" per the above notes */
#define BTTidSame(i1, i2) \
( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
(i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
(i1).ip_posid == (i2).ip_posid )
#define BTItemSame(i1, i2) \
BTTidSame((i1)->bti_itup.t_tid, (i2)->bti_itup.t_tid)
/*
* In general, the btree code tries to localize its knowledge about
* page layout to a couple of routines. However, we need a special
* value to indicate "no page number" in those places where we expect
* page numbers. We can use zero for this because we never need to
* make a pointer to the metadata page.
*/
#define P_NONE 0
/*
* Macros to test whether a page is leftmost or rightmost on its tree level,
* as well as other state info kept in the opaque data.
*/
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
* page. The high key is not a data key, but gives info about what range of
* keys is supposed to be on this page. The high key on a page is required
* to be greater than or equal to any data key that appears on the page.
* If we find ourselves trying to insert a key > high key, we know we need
* to move right (this should only happen if the page was split since we
* examined the parent page).
*
* Our insertion algorithm guarantees that we can use the initial least key
* on our right sibling as the high key. Once a page is created, its high
* key changes only if the page is split.
*
* On a non-rightmost page, the high key lives in item 1 and data items
* start in item 2. Rightmost pages have no high key, so we store data
* items beginning in item 1.
*/
2001-02-23 00:02:33 +01:00
#define P_HIKEY ((OffsetNumber) 1)
#define P_FIRSTKEY ((OffsetNumber) 2)
2001-03-22 05:01:46 +01:00
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
2000-09-12 08:07:52 +02:00
/*
* XLOG records for btree operations
*
2000-09-12 08:07:52 +02:00
* XLOG allows to store some information in high 4 bits of log
* record xl_info field
*/
#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */
2003-08-04 02:43:34 +02:00
#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
2003-08-04 02:43:34 +02:00
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
2005-10-15 04:49:52 +02:00
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */
2000-09-12 08:07:52 +02:00
/*
* All that we need to find changed index tuple
2000-09-12 08:07:52 +02:00
*/
typedef struct xl_btreetid
{
2001-03-22 05:01:46 +01:00
RelFileNode node;
ItemPointerData tid; /* changed tuple id */
2000-09-12 08:07:52 +02:00
} xl_btreetid;
2000-10-04 02:04:43 +02:00
/*
* All that we need to regenerate the meta-data page
2000-10-04 02:04:43 +02:00
*/
typedef struct xl_btree_metadata
2000-09-12 08:07:52 +02:00
{
BlockNumber root;
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
} xl_btree_metadata;
2000-09-12 08:07:52 +02:00
2001-03-22 05:01:46 +01:00
/*
* This is what we need to know about simple (without split) insert.
*
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
* Note that INSERT_META implies it's not a leaf page.
2000-10-04 02:04:43 +02:00
*/
2000-09-12 08:07:52 +02:00
typedef struct xl_btree_insert
{
2001-03-22 05:01:46 +01:00
xl_btreetid target; /* inserted tuple id */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
/* BTITEM FOLLOWS AT END OF STRUCT */
2000-09-12 08:07:52 +02:00
} xl_btree_insert;
2000-10-04 02:04:43 +02:00
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
2000-09-12 08:07:52 +02:00
2001-03-22 05:01:46 +01:00
/*
* On insert with split we save items of both left and right siblings
* and restore content of both pages from log record. This way takes less
* xlog space than the normal approach, because if we did it standardly,
* XLogInsert would almost always think the right page is new and store its
* whole page image.
*
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
* The _L and _R variants indicate whether the inserted btitem went into the
* left or right split page (and thus, whether otherblk is the right or left
* page of the split pair). The _ROOT variants indicate that we are splitting
* the root page, and thus that a newroot record rather than an insert or
2003-08-04 02:43:34 +02:00
* split record should follow. Note that a split record never carries a
* metapage update --- we'll do that in the parent-level update.
2000-10-04 02:04:43 +02:00
*/
2000-09-12 08:07:52 +02:00
typedef struct xl_btree_split
{
2001-03-22 05:01:46 +01:00
xl_btreetid target; /* inserted tuple id */
BlockNumber otherblk; /* second block participated in split: */
2001-03-22 05:01:46 +01:00
/* first one is stored in target' tid */
BlockNumber leftblk; /* prev/left block */
BlockNumber rightblk; /* next/right block */
uint32 level; /* tree level of page being split */
2001-03-22 05:01:46 +01:00
uint16 leftlen; /* len of left page items below */
/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
2000-09-12 08:07:52 +02:00
} xl_btree_split;
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
2000-10-04 02:04:43 +02:00
2001-03-22 05:01:46 +01:00
/*
* This is what we need to know about delete of individual leaf btitems.
* The WAL record can represent deletion of any number of btitems on a
* single index page.
*/
typedef struct xl_btree_delete
{
RelFileNode node;
BlockNumber block;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. The target
* identifies the tuple removed from the parent page (note that we remove
2003-08-04 02:43:34 +02:00
* this tuple's downlink and the *following* tuple's key). Note we do not
* store any content for the deleted page --- it is just rewritten as empty
* during recovery.
*/
typedef struct xl_btree_delete_page
{
xl_btreetid target; /* deleted tuple id in parent page */
BlockNumber deadblk; /* child block being deleted */
BlockNumber leftblk; /* child block's left sibling, if any */
BlockNumber rightblk; /* child block's right sibling */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
} xl_btree_delete_page;
#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
/*
* New root log record. There are zero btitems if this is to establish an
* empty root, or two if it is the result of splitting an old root.
*
* Note that although this implies rewriting the metadata page, we don't need
* an xl_btree_metadata record --- the rootblk and level are sufficient.
2000-10-04 02:04:43 +02:00
*/
typedef struct xl_btree_newroot
{
2001-03-22 05:01:46 +01:00
RelFileNode node;
BlockNumber rootblk; /* location of new root */
uint32 level; /* its tree level */
2000-10-04 02:04:43 +02:00
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
} xl_btree_newroot;
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
/*
* New metapage log record. This is not issued during routine operations;
* it's only used when initializing an empty index.
*/
typedef struct xl_btree_newmeta
{
RelFileNode node;
xl_btree_metadata meta;
} xl_btree_newmeta;
#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta))
2000-09-12 08:07:52 +02:00
/*
* Operator strategy numbers for B-tree have been moved to access/skey.h,
* because many places need to use them in ScanKeyInit() calls.
*/
/*
* When a new operator class is declared, we require that the user
* supply us with an amproc procedure for determining whether, for
* two keys a and b, a < b, a = b, or a > b. This routine must
* return < 0, 0, > 0, respectively, in these three cases. Since we
* only have one such proc in amproc, it's number 1.
*/
#define BTORDER_PROC 1
/*
* We need to be able to tell the difference between read and write
* requests for pages, in order to do locking correctly.
*/
#define BT_READ BUFFER_LOCK_SHARE
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
/*
* BTStackData -- As we descend a tree, we push the (location, downlink)
* pairs from internal pages onto a private stack. If we split a
* leaf, we use this stack to walk back up the tree and insert data
* into parent pages (and possibly to split them, too). Lehman and
* Yao's update algorithm guarantees that under no circumstances can
* our private stack give us an irredeemably bad picture up the tree.
* Again, see the paper for details.
*/
typedef struct BTStackData
{
BlockNumber bts_blkno;
OffsetNumber bts_offset;
BTItemData bts_btitem;
struct BTStackData *bts_parent;
} BTStackData;
typedef BTStackData *BTStack;
/*
* BTScanOpaqueData is used to remember which buffers we're currently
* examining in the scan. We keep these buffers pinned (but not locked,
* see nbtree.c) and recorded in the opaque entry of the scan to avoid
* doing a ReadBuffer() for every tuple in the index.
*
* And it's used to remember actual scankey info (we need it
* if some scankeys evaled at runtime).
*
* curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
* index tuples: we don't adjust scans on insertions (and, if LLL
* is ON, don't hold locks on index pages between passes) - we
* use these pointers to restore index scan positions...
* - vadim 07/29/98
*/
typedef struct BTScanOpaqueData
{
Buffer btso_curbuf;
Buffer btso_mrkbuf;
ItemPointerData curHeapIptr;
ItemPointerData mrkHeapIptr;
/* these fields are set by _bt_preprocess_keys(): */
bool qual_ok; /* false if qual can never be satisfied */
int numberOfKeys; /* number of preprocessed scan keys */
2005-10-15 04:49:52 +02:00
int numberOfRequiredKeys; /* number of keys that must be matched
* to continue the scan */
ScanKey keyData; /* array of preprocessed scan keys */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
/*
* prototypes for functions in nbtree.c (external entry points for btree)
*/
extern Datum btbuild(PG_FUNCTION_ARGS);
extern Datum btinsert(PG_FUNCTION_ARGS);
extern Datum btbeginscan(PG_FUNCTION_ARGS);
extern Datum btgettuple(PG_FUNCTION_ARGS);
extern Datum btgetmulti(PG_FUNCTION_ARGS);
extern Datum btrescan(PG_FUNCTION_ARGS);
extern Datum btendscan(PG_FUNCTION_ARGS);
extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
/*
* prototypes for functions in nbtinsert.c
*/
extern void _bt_doinsert(Relation rel, BTItem btitem,
bool index_is_unique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
2003-08-04 02:43:34 +02:00
BTStack stack, bool is_root, bool is_only);
/*
* prototypes for functions in nbtpage.c
*/
extern void _bt_metapinit(Relation rel);
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
2004-08-29 07:07:03 +02:00
BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_wrtbuf(Relation rel, Buffer buf);
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems(Relation rel, Buffer buf,
2003-08-04 02:43:34 +02:00
OffsetNumber *itemnos, int nitems);
extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full);
/*
* prototypes for functions in nbtsearch.c
*/
extern BTStack _bt_search(Relation rel,
2004-08-29 07:07:03 +02:00
int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey, int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
2001-03-22 05:01:46 +01:00
Page page, OffsetNumber offnum);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
1997-09-08 22:59:27 +02:00
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
/*
* prototypes for functions in nbtutils.c
*/
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
extern BTItem _bt_formitem(IndexTuple itup);
/*
* prototypes for functions in nbtsort.c
*/
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
extern BTSpool *_bt_spoolinit(Relation index, bool isunique, bool isdead);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
/*
* prototypes for functions in nbtxlog.c
*/
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
#endif /* NBTREE_H */