From 70508ba7aed76954b7e630a4952e1360c15db830 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 21 Feb 2003 00:06:22 +0000
Subject: [PATCH] Make btree index structure adjustments and WAL logging
 changes needed to support btree compaction, as per proposal of a few days
 ago.  btree index pages no longer store parent links, instead they have a
 level indicator (counting up from zero for leaf pages).  The FixBTree
 recovery logic is removed, and replaced by code that detects missing
 parent-level insertions during WAL replay.  Also, generate appropriate WAL
 entries when updating btree metapage and when building a btree index from
 scratch.  I believe btree indexes are now completely WAL-legal for the first
 time. initdb forced due to index and WAL changes.

---
 src/backend/access/nbtree/Makefile    |    4 +-
 src/backend/access/nbtree/README      |  506 +++++++----
 src/backend/access/nbtree/nbtinsert.c | 1199 +++++++------------------
 src/backend/access/nbtree/nbtpage.c   |  260 ++++--
 src/backend/access/nbtree/nbtree.c    |  399 +-------
 src/backend/access/nbtree/nbtsearch.c |  133 ++-
 src/backend/access/nbtree/nbtsort.c   |   98 +-
 src/backend/access/nbtree/nbtxlog.c   |  780 ++++++++++++++++
 src/backend/access/transam/rmgr.c     |   37 +-
 src/backend/access/transam/xlog.c     |   58 +-
 src/include/access/nbtree.h           |  570 +++++++-----
 src/include/access/xlog.h             |    6 +-
 src/include/catalog/catversion.h      |    4 +-
 13 files changed, 2179 insertions(+), 1875 deletions(-)
 create mode 100644 src/backend/access/nbtree/nbtxlog.c

diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile
index bdc366dd0a..cf525f9f1f 100644
--- a/src/backend/access/nbtree/Makefile
+++ b/src/backend/access/nbtree/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for access/nbtree
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $
+#    $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.12 2003/02/21 00:06:21 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -13,7 +13,7 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
-       nbtstrat.o nbtutils.o nbtsort.o
+       nbtstrat.o nbtutils.o nbtsort.o nbtxlog.o
 
 all: SUBSYS.o
 
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 964b8b4e11..8fc0c5c7bf 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,186 +1,378 @@
-$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
 Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
-on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).
+on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).  We also
+use a simplified version of the deletion logic described in Lanin and
+Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
+Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
 
-We have made the following changes in order to incorporate their algorithm
+The Lehman and Yao algorithm and insertions
+-------------------------------------------
+
+We have made the following changes in order to incorporate the L&Y algorithm
 into Postgres:
 
-+  The requirement that all btree keys be unique is too onerous,
-   but the algorithm won't work correctly without it.  Fortunately, it is
-   only necessary that keys be unique on a single tree level, because L&Y
-   only use the assumption of key uniqueness when re-finding a key in a
-   parent node (to determine where to insert the key for a split page).
-   Therefore, we can use the link field to disambiguate multiple
-   occurrences of the same user key: only one entry in the parent level
-   will be pointing at the page we had split.  (Indeed we need not look at
-   the real "key" at all, just at the link field.)  We can distinguish
-   items at the leaf level in the same way, by examining their links to
-   heap tuples; we'd never have two items for the same heap tuple.
+The requirement that all btree keys be unique is too onerous,
+but the algorithm won't work correctly without it.  Fortunately, it is
+only necessary that keys be unique on a single tree level, because L&Y
+only use the assumption of key uniqueness when re-finding a key in a
+parent page (to determine where to insert the key for a split page).
+Therefore, we can use the link field to disambiguate multiple
+occurrences of the same user key: only one entry in the parent level
+will be pointing at the page we had split.  (Indeed we need not look at
+the real "key" at all, just at the link field.)  We can distinguish
+items at the leaf level in the same way, by examining their links to
+heap tuples; we'd never have two items for the same heap tuple.
 
-+  Lehman and Yao assume that the key range for a subtree S is described
-   by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
-   node.  This does not work for nonunique keys (for example, if we have
-   enough equal keys to spread across several leaf pages, there *must* be
-   some equal bounding keys in the first level up).  Therefore we assume
-   Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
-   bounding key in an upper tree level must descend to the left of that
-   key to ensure it finds any equal keys in the preceding page.  An
-   insertion that sees the high key of its target page is equal to the key
-   to be inserted has a choice whether or not to move right, since the new
-   key could go on either page.  (Currently, we try to find a page where
-   there is room for the new key without a split.)
+Lehman and Yao assume that the key range for a subtree S is described
+by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
+page.  This does not work for nonunique keys (for example, if we have
+enough equal keys to spread across several leaf pages, there *must* be
+some equal bounding keys in the first level up).  Therefore we assume
+Ki <= v <= Ki+1 instead.  A search that finds exact equality to a
+bounding key in an upper tree level must descend to the left of that
+key to ensure it finds any equal keys in the preceding page.  An
+insertion that sees the high key of its target page is equal to the key
+to be inserted has a choice whether or not to move right, since the new
+key could go on either page.  (Currently, we try to find a page where
+there is room for the new key without a split.)
 
-+  Lehman and Yao don't require read locks, but assume that in-memory
-   copies of tree nodes are unshared.  Postgres shares in-memory buffers
-   among backends.  As a result, we do page-level read locking on btree
-   nodes in order to guarantee that no record is modified while we are
-   examining it.  This reduces concurrency but guaranteees correct
-   behavior.  An advantage is that when trading in a read lock for a
-   write lock, we need not re-read the page after getting the write lock.
-   Since we're also holding a pin on the shared buffer containing the
-   page, we know that buffer still contains the page and is up-to-date.
+Lehman and Yao don't require read locks, but assume that in-memory
+copies of tree pages are unshared.  Postgres shares in-memory buffers
+among backends.  As a result, we do page-level read locking on btree
+pages in order to guarantee that no record is modified while we are
+examining it.  This reduces concurrency but guarantees correct
+behavior.  An advantage is that when trading in a read lock for a
+write lock, we need not re-read the page after getting the write lock.
+Since we're also holding a pin on the shared buffer containing the
+page, we know that buffer still contains the page and is up-to-date.
 
-+  We support the notion of an ordered "scan" of an index as well as
-   insertions, deletions, and simple lookups.  A scan in the forward
-   direction is no problem, we just use the right-sibling pointers that
-   L&Y require anyway.  (Thus, once we have descended the tree to the
-   correct start point for the scan, the scan looks only at leaf pages
-   and never at higher tree levels.)  To support scans in the backward
-   direction, we also store a "left sibling" link much like the "right
-   sibling".  (This adds an extra step to the L&Y split algorithm: while
-   holding the write lock on the page being split, we also lock its former
-   right sibling to update that page's left-link.  This is safe since no
-   writer of that page can be interested in acquiring a write lock on our
-   page.)  A backwards scan has one additional bit of complexity: after
-   following the left-link we must account for the possibility that the
-   left sibling page got split before we could read it.  So, we have to
-   move right until we find a page whose right-link matches the page we
-   came from.
+We support the notion of an ordered "scan" of an index as well as
+insertions, deletions, and simple lookups.  A scan in the forward
+direction is no problem, we just use the right-sibling pointers that
+L&Y require anyway.  (Thus, once we have descended the tree to the
+correct start point for the scan, the scan looks only at leaf pages
+and never at higher tree levels.)  To support scans in the backward
+direction, we also store a "left sibling" link much like the "right
+sibling".  (This adds an extra step to the L&Y split algorithm: while
+holding the write lock on the page being split, we also lock its former
+right sibling to update that page's left-link.  This is safe since no
+writer of that page can be interested in acquiring a write lock on our
+page.)  A backwards scan has one additional bit of complexity: after
+following the left-link we must account for the possibility that the
+left sibling page got split before we could read it.  So, we have to
+move right until we find a page whose right-link matches the page we
+came from.  (Actually, it's even harder than that; see deletion discussion
+below.)
 
-+  Read locks on a page are held for as long as a scan is examining a page.
-   But nbtree.c arranges to drop the read lock, but not the buffer pin,
-   on the current page of a scan before control leaves nbtree.  When we
-   come back to resume the scan, we have to re-grab the read lock and
-   then move right if the current item moved (see _bt_restscan()).  Keeping
-   the pin ensures that the current item cannot move left or be deleted
-   (see btbulkdelete).
+Read locks on a page are held for as long as a scan is examining a page.
+But nbtree.c arranges to drop the read lock, but not the buffer pin,
+on the current page of a scan before control leaves nbtree.  When we
+come back to resume the scan, we have to re-grab the read lock and
+then move right if the current item moved (see _bt_restscan()).  Keeping
+the pin ensures that the current item cannot move left or be deleted
+(see btbulkdelete).
 
-+  In most cases we release our lock and pin on a page before attempting
-   to acquire pin and lock on the page we are moving to.  In a few places
-   it is necessary to lock the next page before releasing the current one.
-   This is safe when moving right or up, but not when moving left or down
-   (else we'd create the possibility of deadlocks).
+In most cases we release our lock and pin on a page before attempting
+to acquire pin and lock on the page we are moving to.  In a few places
+it is necessary to lock the next page before releasing the current one.
+This is safe when moving right or up, but not when moving left or down
+(else we'd create the possibility of deadlocks).
 
-+  Lehman and Yao fail to discuss what must happen when the root page
-   becomes full and must be split.  Our implementation is to split the
-   root in the same way that any other page would be split, then construct
-   a new root page holding pointers to both of the resulting pages (which
-   now become siblings on level 2 of the tree).  The new root page is then
-   installed by altering the root pointer in the meta-data page (see
-   below).  This works because the root is not treated specially in any
-   other way --- in particular, searches will move right using its link
-   pointer if the link is set.  Therefore, searches will find the data
-   that's been moved into the right sibling even if they read the metadata
-   page before it got updated.  This is the same reasoning that makes a
-   split of a non-root page safe.  The locking considerations are similar too.
+Lehman and Yao fail to discuss what must happen when the root page
+becomes full and must be split.  Our implementation is to split the
+root in the same way that any other page would be split, then construct
+a new root page holding pointers to both of the resulting pages (which
+now become siblings on the next level of the tree).  The new root page
+is then installed by altering the root pointer in the meta-data page (see
+below).  This works because the root is not treated specially in any
+other way --- in particular, searches will move right using its link
+pointer if the link is set.  Therefore, searches will find the data
+that's been moved into the right sibling even if they read the meta-data
+page before it got updated.  This is the same reasoning that makes a
+split of a non-root page safe.  The locking considerations are similar too.
 
-+  Lehman and Yao assume fixed-size keys, but we must deal with
-   variable-size keys.  Therefore there is not a fixed maximum number of
-   keys per page; we just stuff in as many as will fit.  When we split a
-   page, we try to equalize the number of bytes, not items, assigned to
-   each of the resulting pages.  Note we must include the incoming item in
-   this calculation, otherwise it is possible to find that the incoming
-   item doesn't fit on the split page where it needs to go!
+When an inserter recurses up the tree, splitting internal pages to insert
+links to pages inserted on the level below, it is possible that it will
+need to access a page above the level that was the root when it began its
+descent (or more accurately, the level that was the root when it read the
+meta-data page).  In this case the stack it made while descending does not
+help for finding the correct page.  When this happens, we find the correct
+place by re-descending the tree until we reach the level one above the
+level we need to insert a link to, and then moving right as necessary.
+(Typically this will take only two fetches, the meta-data page and the new
+root, but in principle there could have been more than one root split
+since we saw the root.  We can identify the correct tree level by means of
+the level numbers stored in each page.  The situation is rare enough that
+we do not need a more efficient solution.)
 
-In addition, the following things are handy to know:
+Lehman and Yao assume fixed-size keys, but we must deal with
+variable-size keys.  Therefore there is not a fixed maximum number of
+keys per page; we just stuff in as many as will fit.  When we split a
+page, we try to equalize the number of bytes, not items, assigned to
+each of the resulting pages.  Note we must include the incoming item in
+this calculation, otherwise it is possible to find that the incoming
+item doesn't fit on the split page where it needs to go!
 
-+  Page zero of every btree is a meta-data page.  This page stores
-   the location of the root page, a pointer to a list of free
-   pages, and other stuff that's handy to know.  (Currently, we
-   never shrink btree indexes so there are never any free pages.)
+The deletion algorithm
+----------------------
 
-+  The algorithm assumes we can fit at least three items per page
-   (a "high key" and two real data items).  Therefore it's unsafe
-   to accept items larger than 1/3rd page size.  Larger items would
-   work sometimes, but could cause failures later on depending on
-   what else gets put on their page.
+Deletions of leaf items are handled by getting a super-exclusive lock on
+the target page, so that no other backend has a pin on the page when the
+deletion starts.  This means no scan is pointing at the page, so no other
+backend can lose its place due to the item deletion.
 
-+  This algorithm doesn't guarantee btree consistency after a kernel crash
-   or hardware failure.  To do that, we'd need ordered writes, and UNIX
-   doesn't support ordered writes (short of fsync'ing every update, which
-   is too high a price).  Rebuilding corrupted indexes during restart
-   seems more attractive.
+The above does not work for deletion of items in internal pages, since
+other backends keep no lock nor pin on a page they have descended past.
+Instead, when a backend is ascending the tree using its stack, it must
+be prepared for the possibility that the item it wants is to the left of
+the recorded position (but it can't have moved left out of the recorded
+page).  Since we hold a lock on the lower page (per L&Y) until we have
+re-found the parent item that links to it, we can be assured that the
+parent item does still exist and can't have been deleted.  Also, because
+we are matching downlink page numbers and not data keys, we don't have any
+problem with possibly misidentifying the parent item.
 
-+  Deletions are handled by getting a super-exclusive lock on the target
-   page, so that no other backend has a pin on the page when the deletion
-   starts.  This means no scan is pointing at the page.  This is OK for
-   deleting leaf items, probably not OK for deleting internal nodes;
-   will need to think harder when it's time to support index compaction.
+We consider deleting an entire page from the btree only when it's become
+completely empty of items.  (Merging partly-full pages would allow better
+space reuse, but it seems impractical to move existing data items left or
+right to make this happen --- a scan moving in the opposite direction
+might miss the items if so.  We could do it during VACUUM FULL, though.)
+Also, we *never* delete the rightmost page on a tree level (this
+restriction simplifies the traversal algorithms, as explained below).
 
-+  "ScanKey" data structures are used in two fundamentally different ways
-   in this code.  Searches for the initial position for a scan, as well as
-   insertions, use scankeys in which the comparison function is a 3-way
-   comparator (<0, =0, >0 result).  These scankeys are built within the
-   btree code (eg, by _bt_mkscankey()) and used by _bt_compare().  Once we
-   are positioned, sequential examination of tuples in a scan is done by
-   _bt_checkkeys() using scankeys in which the comparison functions return
-   booleans --- for example, int4lt might be used.  These scankeys are the
-   ones originally passed in from outside the btree code.  Same
-   representation, but different comparison functions!
+To delete an empty page, we acquire write lock on its left sibling (if
+any), the target page itself, the right sibling (there must be one), and
+the parent page, in that order.  The parent page must be found using the
+same type of search as used to find the parent during an insertion split.
+Then we update the side-links in the siblings, mark the target page
+deleted, and remove the downlink from the parent, as well as the parent's
+upper bounding key for the target (the one separating it from its right
+sibling).  This causes the target page's key space to effectively belong
+to its right sibling.  (Neither the left nor right sibling pages need to
+change their "high key" if any; so there is no problem with possibly not
+having enough space to replace a high key.)  The side-links in the target
+page are not changed.
 
-Notes about data representation:
+(Note: Lanin and Shasha prefer to make the key space move left, but their
+argument for doing so hinges on not having left-links, which we have
+anyway.  So we simplify the algorithm by moving key space right.)
 
-+  The right-sibling link required by L&Y is kept in the page "opaque
-   data" area, as is the left-sibling link and some flags.
+To preserve consistency on the parent level, we cannot merge the key space
+of a page into its right sibling unless the right sibling is a child of
+the same parent --- otherwise, the parent's key space assignment changes
+too, meaning we'd have to make bounding-key updates in its parent, and
+perhaps all the way up the tree.  Since we can't possibly do that
+atomically, we forbid this case.  That means that the rightmost child of a
+parent page can't be deleted unless it's the only remaining child.
 
-+  We also keep a parent link in the opaque data, but this link is not
-   very trustworthy because it is not updated when the parent page splits.
-   Thus, it points to some page on the parent level, but possibly a page
-   well to the left of the page's actual current parent.  In most cases
-   we do not need this link at all.  Normally we return to a parent page
-   using a stack of entries that are made as we descend the tree, as in L&Y.
-   There is exactly one case where the stack will not help: concurrent
-   root splits.  If an inserter process needs to split what had been the
-   root when it started its descent, but finds that that page is no longer
-   the root (because someone else split it meanwhile), then it uses the
-   parent link to move up to the next level.  This is OK because we do fix
-   the parent link in a former root page when splitting it.  This logic
-   will work even if the root is split multiple times (even up to creation
-   of multiple new levels) before an inserter returns to it.  The same
-   could not be said of finding the new root via the metapage, since that
-   would work only for a single level of added root.
+When we delete the last remaining child of a parent page, we mark the
+parent page "half-dead" as part of the atomic update that deletes the
+child page.  This implicitly transfers the parent's key space to its right
+sibling (which it must have, since we never delete the overall-rightmost
+page of a level).  No future insertions into the parent level are allowed
+to insert keys into the half-dead page --- they must move right to its
+sibling, instead.  The parent remains empty and can be deleted in a
+separate atomic action.  (However, if it's the rightmost child of its own
+parent, it might have to stay half-dead for a while, until it's also the
+only child.)
 
-+  The Postgres disk block data format (an array of items) doesn't fit
-   Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
-   so we have to play some games.
+Note that an empty leaf page is a valid tree state, but an empty interior
+page is not legal (an interior page must have children to delegate its
+key space to).  So an interior page *must* be marked half-dead as soon
+as its last child is deleted.
 
-+  On a page that is not rightmost in its tree level, the "high key" is
-   kept in the page's first item, and real data items start at item 2.
-   The link portion of the "high key" item goes unused.  A page that is
-   rightmost has no "high key", so data items start with the first item.
-   Putting the high key at the left, rather than the right, may seem odd,
-   but it avoids moving the high key as we add data items.
+The notion of a half-dead page means that the key space relationship between
+the half-dead page's level and its parent's level may be a little out of
+whack: key space that appears to belong to the half-dead page's parent on the
+parent level may really belong to its right sibling.  We can tolerate this,
+however, because insertions and deletions on upper tree levels are always
+done by reference to child page numbers, not keys.  The only cost is that
+searches may sometimes descend to the half-dead page and then have to move
+right, rather than going directly to the sibling page.
 
-+  On a leaf page, the data items are simply links to (TIDs of) tuples
-   in the relation being indexed, with the associated key values.
+A deleted page cannot be reclaimed immediately, since there may be other
+processes waiting to reference it (ie, search processes that just left the
+parent, or scans moving right or left from one of the siblings).  These
+processes must observe that the page is marked dead and recover
+accordingly.  Searches and forward scans simply follow the right-link
+until they find a non-dead page --- this will be where the deleted page's
+key-space moved to.
 
-+  On a non-leaf page, the data items are down-links to child pages with
-   bounding keys.  The key in each data item is the *lower* bound for
-   keys on that child page, so logically the key is to the left of that
-   downlink.  The high key (if present) is the upper bound for the last
-   downlink.  The first data item on each such page has no lower bound
-   --- or lower bound of minus infinity, if you prefer.  The comparison
-   routines must treat it accordingly.  The actual key stored in the
-   item is irrelevant, and need not be stored at all.  This arrangement
-   corresponds to the fact that an L&Y non-leaf page has one more pointer
-   than key.
+Stepping left in a backward scan is complicated because we must consider
+the possibility that the left sibling was just split (meaning we must find
+the rightmost page derived from the left sibling), plus the possibility
+that the page we were just on has now been deleted and hence isn't in the
+sibling chain at all anymore.  So the move-left algorithm becomes:
+0. Remember the page we are on as the "original page".
+1. Follow the original page's left-link (we're done if this is zero).
+2. If the current page is live and its right-link matches the "original
+   page", we are done.
+3. Otherwise, move right one or more times looking for a live page whose
+   right-link matches the "original page".  If found, we are done.  (In
+   principle we could scan all the way to the right end of the index, but
+   in practice it seems better to give up after a small number of tries.
+   It's unlikely the original page's sibling split more than a few times
+   while we were in flight to it; if we do not find a matching link in a
+   few tries, then most likely the original page is deleted.)
+4. Return to the "original page".  If it is still live, return to step 1
+   (we guessed wrong about it being deleted, and should restart with its
+   current left-link).  If it is dead, move right until a non-dead page
+   is found (there must be one, since rightmost pages are never deleted),
+   mark that as the new "original page", and return to step 1.
+This algorithm is correct because the live page found by step 4 will have
+the same left keyspace boundary as the page we started from.  Therefore,
+when we ultimately exit, it must be on a page whose right keyspace
+boundary matches the left boundary of where we started --- which is what
+we need to be sure we don't miss or re-scan any items.
 
-Notes to operator class implementors:
+A deleted page can only be reclaimed once there is no scan or search that
+has a reference to it; until then, it must stay in place with its
+right-link undisturbed.  We implement this by waiting until all
+transactions that were running at the time of deletion are dead, which is
+overly strong, but is simple to implement within Postgres.  When marked
+dead, a deleted page is labeled with the next-transaction counter value.
+VACUUM can reclaim the page for re-use when this transaction number is
+older than the oldest open transaction.  (NOTE: VACUUM FULL can reclaim
+such pages immediately.)
 
-+  With this implementation, we require each supported datatype to supply
-   us with a comparison procedure via pg_amproc.  This procedure must take
-   two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
-   A = B, or A > B, respectively.  See nbtcompare.c for examples.
+Reclaiming a page doesn't actually change its state on disk --- we simply
+record it in the shared-memory free space map, from which it will be
+handed out the next time a new page is needed for a page split.  The
+deleted page's contents will be overwritten by the split operation.
+(Note: if we find a deleted page with an extremely old transaction
+number, it'd be worthwhile to re-mark it with FrozenTransactionId so that
+a later xid wraparound can't cause us to think the page is unreclaimable.
+But in more normal situations this would be a waste of a disk write.)
+
+Because we never delete the rightmost page of any level (and in particular
+never delete the root), it's impossible for the height of the tree to
+decrease.  After massive deletions we might have a scenario in which the
+tree is "skinny", with several single-page levels below the root.
+Operations will still be correct in this case, but we'd waste cycles
+descending through the single-page levels.  To handle this we use an idea
+from Lanin and Shasha: we keep track of the "fast root" level, which is
+the lowest single-page level.  The meta-data page keeps a pointer to this
+level as well as the true root.  All ordinary operations initiate their
+searches at the fast root, not the true root.  When we split a page that is
+alone on its level or delete the next-to-last page on a level (both cases
+are easily detected), we have to make sure that the fast root pointer is
+adjusted appropriately.  In the split case, we do this work as part of the
+atomic update for the insertion into the parent level; in the delete case
+as part of the atomic update for the delete (either way, the metapage has
+to be the last page locked in the update to avoid deadlock risks).  This
+avoids race conditions if two such operations are executing concurrently.
+
+VACUUM needs to do a linear scan of an index to search for empty leaf
+pages and half-dead parent pages that can be deleted, as well as deleted
+pages that can be reclaimed because they are older than all open
+transactions.
+
+WAL considerations
+------------------
+
+The insertion and deletion algorithms in themselves don't guarantee btree
+consistency after a crash.  To provide robustness, we depend on WAL
+replay.  A single WAL entry is effectively an atomic action --- we can
+redo it from the log if it fails to complete.
+
+Ordinary item insertions (that don't force a page split) are of course
+single WAL entries, since they only affect one page.  The same for
+leaf-item deletions (if the deletion brings the leaf page to zero items,
+it is now a candidate to be deleted, but that is a separate action).
+
+An insertion that causes a page split is logged as a single WAL entry for
+the changes occurring on the insertion's level --- including update of the
+right sibling's left-link --- followed by a second WAL entry for the
+insertion on the parent level (which might itself be a page split, requiring
+an additional insertion above that, etc).
+
+For a root split, the follow-on WAL entry is a "new root" entry rather than
+an "insertion" entry, but details are otherwise much the same.
+
+Because insertion involves multiple atomic actions, the WAL replay logic
+has to detect the case where a page split isn't followed by a matching
+insertion on the parent level, and then do that insertion on its own (and
+recursively for any subsequent parent insertion, of course).  This is
+feasible because the WAL entry for the split contains enough info to know
+what must be inserted in the parent level.
+
+When splitting a non-root page that is alone on its level, the required
+metapage update (of the "fast root" link) is performed and logged as part
+of the insertion into the parent level.  When splitting the root page, the
+metapage update is handled as part of the "new root" action.
+
+A page deletion is logged as a single WAL entry covering all four
+required page updates (target page, left and right siblings, and parent)
+as an atomic event.  (Any required fast-root link update is also part
+of the WAL entry.)  If the parent page becomes half-dead but is not
+immediately deleted due to a subsequent crash, there is no loss of
+consistency, and the empty page will be picked up by the next VACUUM.
+
+Other things that are handy to know
+-----------------------------------
+
+Page zero of every btree is a meta-data page.  This page stores the
+location of the root page --- both the true root and the current effective
+root ("fast" root).
+
+The algorithm assumes we can fit at least three items per page
+(a "high key" and two real data items).  Therefore it's unsafe
+to accept items larger than 1/3rd page size.  Larger items would
+work sometimes, but could cause failures later on depending on
+what else gets put on their page.
+
+"ScanKey" data structures are used in two fundamentally different ways
+in this code.  Searches for the initial position for a scan, as well as
+insertions, use scankeys in which the comparison function is a 3-way
+comparator (<0, =0, >0 result).  These scankeys are built within the
+btree code (eg, by _bt_mkscankey()) and used by _bt_compare().  Once we
+are positioned, sequential examination of tuples in a scan is done by
+_bt_checkkeys() using scankeys in which the comparison functions return
+booleans --- for example, int4lt might be used.  These scankeys are the
+ones originally passed in from outside the btree code.  Same
+representation, but different comparison functions!
+
+Notes about data representation
+-------------------------------
+
+The right-sibling link required by L&Y is kept in the page "opaque
+data" area, as is the left-sibling link, the page level, and some flags.
+The page level counts upwards from zero at the leaf level, to the tree
+depth minus 1 at the root.  (Counting up from the leaves ensures that we
+don't need to renumber any existing pages when splitting the root.)
+
+The Postgres disk block data format (an array of items) doesn't fit
+Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
+so we have to play some games.
+
+On a page that is not rightmost in its tree level, the "high key" is
+kept in the page's first item, and real data items start at item 2.
+The link portion of the "high key" item goes unused.  A page that is
+rightmost has no "high key", so data items start with the first item.
+Putting the high key at the left, rather than the right, may seem odd,
+but it avoids moving the high key as we add data items.
+
+On a leaf page, the data items are simply links to (TIDs of) tuples
+in the relation being indexed, with the associated key values.
+
+On a non-leaf page, the data items are down-links to child pages with
+bounding keys.  The key in each data item is the *lower* bound for
+keys on that child page, so logically the key is to the left of that
+downlink.  The high key (if present) is the upper bound for the last
+downlink.  The first data item on each such page has no lower bound
+--- or lower bound of minus infinity, if you prefer.  The comparison
+routines must treat it accordingly.  The actual key stored in the
+item is irrelevant, and need not be stored at all.  This arrangement
+corresponds to the fact that an L&Y non-leaf page has one more pointer
+than key.
+
+Notes to operator class implementors
+------------------------------------
+
+With this implementation, we require each supported datatype to supply
+us with a comparison procedure via pg_amproc.  This procedure must take
+two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
+A = B, or A > B, respectively.  See nbtcompare.c for examples.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 99011a5c95..a93a9fed8c 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.96 2002/09/04 20:31:09 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -35,15 +35,6 @@ typedef struct
 	int			best_delta;		/* best size delta so far */
 } FindSplitData;
 
-extern bool FixBTree;
-
-Buffer		_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
-static void _bt_fixtree(Relation rel, BlockNumber blkno);
-static void _bt_fixbranch(Relation rel, BlockNumber lblkno,
-			  BlockNumber rblkno, BTStack true_stack);
-static void _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit);
-static void _bt_fixup(Relation rel, Buffer buf);
-static OffsetNumber _bt_getoff(Page page, BlockNumber blkno);
 
 static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
 
@@ -54,9 +45,8 @@ static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf,
 			   BTStack stack,
 			   int keysz, ScanKey scankey,
 			   BTItem btitem,
-			   OffsetNumber afteritem);
-static void _bt_insertuple(Relation rel, Buffer buf,
-			   Size itemsz, BTItem btitem, OffsetNumber newitemoff);
+			   OffsetNumber afteritem,
+			   bool split_only_page);
 static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		  OffsetNumber newitemoff, Size newitemsz,
 		  BTItem newitem, bool newitemonleft,
@@ -149,7 +139,8 @@ top:
 	}
 
 	/* do the insertion */
-	res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, 0);
+	res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem,
+						 0, false);
 
 	/* be tidy */
 	_bt_freestack(stack);
@@ -320,6 +311,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
  *			   right using information stored in the parent stack).
  *			+  invokes itself with the appropriate tuple for the right
  *			   child page on the parent.
+ *			+  updates the metapage if a true root or fast root is split.
  *
  *		On entry, we must have the right buffer on which to do the
  *		insertion, and the buffer must be pinned and locked.  On return,
@@ -358,7 +350,8 @@ _bt_insertonpg(Relation rel,
 			   int keysz,
 			   ScanKey scankey,
 			   BTItem btitem,
-			   OffsetNumber afteritem)
+			   OffsetNumber afteritem,
+			   bool split_only_page)
 {
 	InsertIndexResult res;
 	Page		page;
@@ -458,11 +451,10 @@ _bt_insertonpg(Relation rel,
 	 */
 	if (PageGetFreeSpace(page) < itemsz)
 	{
-		Buffer		rbuf;
-		BlockNumber bknum = BufferGetBlockNumber(buf);
-		BlockNumber rbknum;
 		bool		is_root = P_ISROOT(lpageop);
+		bool		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
 		bool		newitemonleft;
+		Buffer		rbuf;
 
 		/* Choose the split point */
 		firstright = _bt_findsplitloc(rel, page,
@@ -488,128 +480,127 @@ _bt_insertonpg(Relation rel,
 		 * locks for the child pages until we locate the parent, but we can
 		 * release them before doing the actual insertion (see Lehman and Yao
 		 * for the reasoning).
-		 *
-		 * Here we have to do something Lehman and Yao don't talk about:
-		 * deal with a root split and construction of a new root.  If our
-		 * stack is empty then we have just split a node on what had been
-		 * the root level when we descended the tree.  If it is still the
-		 * root then we perform a new-root construction.  If it *wasn't*
-		 * the root anymore, use the parent pointer to get up to the root
-		 * level that someone constructed meanwhile, and find the right
-		 * place to insert as for the normal case.
 		 *----------
 		 */
-
-		if (is_root)
-		{
-			Buffer		rootbuf;
-
-			Assert(stack == (BTStack) NULL);
-			/* create a new root node and release the split buffers */
-			rootbuf = _bt_newroot(rel, buf, rbuf);
-			_bt_wrtbuf(rel, rootbuf);
-			_bt_wrtbuf(rel, rbuf);
-			_bt_wrtbuf(rel, buf);
-		}
-		else
-		{
-			InsertIndexResult newres;
-			BTItem		new_item;
-			BTStackData fakestack;
-			BTItem		ritem;
-			Buffer		pbuf;
-
-			/* If root page was splitted */
-			if (stack == (BTStack) NULL)
-			{
-				elog(LOG, "btree: concurrent ROOT page split");
-
-				/*
-				 * If root page splitter failed to create new root page
-				 * then old root' btpo_parent still points to metapage. We
-				 * have to fix root page in this case.
-				 */
-				if (BTreeInvalidParent(lpageop))
-				{
-					if (!FixBTree)
-						elog(ERROR, "bt_insertonpg[%s]: no root page found", RelationGetRelationName(rel));
-					_bt_wrtbuf(rel, rbuf);
-					_bt_wrtnorelbuf(rel, buf);
-					elog(WARNING, "bt_insertonpg[%s]: root page unfound - fixing upper levels", RelationGetRelationName(rel));
-					_bt_fixup(rel, buf);
-					goto formres;
-				}
-
-				/*
-				 * Set up a phony stack entry if we haven't got a real one
-				 */
-				stack = &fakestack;
-				stack->bts_blkno = lpageop->btpo_parent;
-				stack->bts_offset = InvalidOffsetNumber;
-				/* bts_btitem will be initialized below */
-				stack->bts_parent = NULL;
-			}
-
-			/* get high key from left page == lowest key on new right page */
-			ritem = (BTItem) PageGetItem(page,
-										 PageGetItemId(page, P_HIKEY));
-
-			/* form an index tuple that points at the new right page */
-			new_item = _bt_formitem(&(ritem->bti_itup));
-			rbknum = BufferGetBlockNumber(rbuf);
-			ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
-
-			/*
-			 * Find the parent buffer and get the parent page.
-			 *
-			 * Oops - if we were moved right then we need to change stack
-			 * item! We want to find parent pointing to where we are,
-			 * right ?	  - vadim 05/27/97
-			 *
-			 * Interestingly, this means we didn't *really* need to stack the
-			 * parent key at all; all we really care about is the saved
-			 * block and offset as a starting point for our search...
-			 */
-			ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
-						   bknum, P_HIKEY);
-
-			pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
-
-			/* Now we can write and unlock the children */
-			_bt_wrtbuf(rel, rbuf);
-			_bt_wrtbuf(rel, buf);
-
-			if (pbuf == InvalidBuffer)
-			{
-				if (!FixBTree)
-					elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!"
-						 "\n\tRecreate index %s.", RelationGetRelationName(rel));
-				pfree(new_item);
-				elog(WARNING, "bt_insertonpg[%s]: parent page unfound - fixing branch", RelationGetRelationName(rel));
-				_bt_fixbranch(rel, bknum, rbknum, stack);
-				goto formres;
-			}
-			/* Recursively update the parent */
-			newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
-									0, NULL, new_item, stack->bts_offset);
-
-			/* be tidy */
-			pfree(newres);
-			pfree(new_item);
-		}
+		_bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only);
 	}
 	else
 	{
+		Buffer		metabuf = InvalidBuffer;
+		Page		metapg = NULL;
+		BTMetaPageData *metad = NULL;
+
 		itup_off = newitemoff;
 		itup_blkno = BufferGetBlockNumber(buf);
 
-		_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
+		/*
+		 * If we are doing this insert because we split a page that was
+		 * the only one on its tree level, but was not the root, it may
+		 * have been the "fast root".  We need to ensure that the fast root
+		 * link points at or above the current page.  We can safely acquire
+		 * a lock on the metapage here --- see comments for _bt_newroot().
+		 */
+		if (split_only_page)
+		{
+			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+			metapg = BufferGetPage(metabuf);
+			metad = BTPageGetMeta(metapg);
+
+			if (metad->btm_fastlevel >= lpageop->btpo.level)
+			{
+				/* no update wanted */
+				_bt_relbuf(rel, metabuf);
+				metabuf = InvalidBuffer;
+			}
+		}
+
+		/* Do the actual update.  No elog(ERROR) until changes are logged */
+		START_CRIT_SECTION();
+
+		_bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
+
+		if (BufferIsValid(metabuf))
+		{
+			metad->btm_fastroot = itup_blkno;
+			metad->btm_fastlevel = lpageop->btpo.level;
+		}
+
+		/* XLOG stuff */
+		if (!rel->rd_istemp)
+		{
+			xl_btree_insert xlrec;
+			xl_btree_metadata xlmeta;
+			uint8		xlinfo;
+			XLogRecPtr	recptr;
+			XLogRecData rdata[3];
+			XLogRecData *nextrdata;
+			BTItemData	truncitem;
+
+			xlrec.target.node = rel->rd_node;
+			ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
+
+			rdata[0].buffer = InvalidBuffer;
+			rdata[0].data = (char *) &xlrec;
+			rdata[0].len = SizeOfBtreeInsert;
+			rdata[0].next = nextrdata = &(rdata[1]);
+
+			if (BufferIsValid(metabuf))
+			{
+				xlmeta.root = metad->btm_root;
+				xlmeta.level = metad->btm_level;
+				xlmeta.fastroot = metad->btm_fastroot;
+				xlmeta.fastlevel = metad->btm_fastlevel;
+
+				nextrdata->buffer = InvalidBuffer;
+				nextrdata->data = (char *) &xlmeta;
+				nextrdata->len = sizeof(xl_btree_metadata);
+				nextrdata->next = nextrdata + 1;
+				nextrdata++;
+				xlinfo = XLOG_BTREE_INSERT_META;
+			}
+			else if (P_ISLEAF(lpageop))
+				xlinfo = XLOG_BTREE_INSERT_LEAF;
+			else
+				xlinfo = XLOG_BTREE_INSERT_UPPER;
+
+			/* Read comments in _bt_pgaddtup */
+			if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
+			{
+				truncitem = *btitem;
+				truncitem.bti_itup.t_info = sizeof(BTItemData);
+				nextrdata->data = (char *) &truncitem;
+				nextrdata->len = sizeof(BTItemData);
+			}
+			else
+			{
+				nextrdata->data = (char *) btitem;
+				nextrdata->len = IndexTupleDSize(btitem->bti_itup) +
+					(sizeof(BTItemData) - sizeof(IndexTupleData));
+			}
+			nextrdata->buffer = buf;
+			nextrdata->next = NULL;
+
+			recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
+
+			if (BufferIsValid(metabuf))
+			{
+				PageSetLSN(metapg, recptr);
+				PageSetSUI(metapg, ThisStartUpID);
+			}
+
+			PageSetLSN(page, recptr);
+			PageSetSUI(page, ThisStartUpID);
+		}
+
+		END_CRIT_SECTION();
 
 		/* Write out the updated page and release pin/lock */
+		if (BufferIsValid(metabuf))
+			_bt_wrtbuf(rel, metabuf);
+
 		_bt_wrtbuf(rel, buf);
 	}
 
-formres:;
 	/* by here, the new tuple is inserted at itup_blkno/itup_off */
 	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
 	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
@@ -617,61 +608,6 @@ formres:;
 	return res;
 }
 
-static void
-_bt_insertuple(Relation rel, Buffer buf,
-			   Size itemsz, BTItem btitem, OffsetNumber newitemoff)
-{
-	Page		page = BufferGetPage(buf);
-	BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	START_CRIT_SECTION();
-
-	_bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
-
-	/* XLOG stuff */
-	if (!rel->rd_istemp)
-	{
-		xl_btree_insert xlrec;
-		uint8		flag = XLOG_BTREE_INSERT;
-		XLogRecPtr	recptr;
-		XLogRecData rdata[2];
-		BTItemData	truncitem;
-
-		xlrec.target.node = rel->rd_node;
-		ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff);
-		rdata[0].buffer = InvalidBuffer;
-		rdata[0].data = (char *) &xlrec;
-		rdata[0].len = SizeOfBtreeInsert;
-		rdata[0].next = &(rdata[1]);
-
-		/* Read comments in _bt_pgaddtup */
-		if (!(P_ISLEAF(pageop)) && newitemoff == P_FIRSTDATAKEY(pageop))
-		{
-			truncitem = *btitem;
-			truncitem.bti_itup.t_info = sizeof(BTItemData);
-			rdata[1].data = (char *) &truncitem;
-			rdata[1].len = sizeof(BTItemData);
-		}
-		else
-		{
-			rdata[1].data = (char *) btitem;
-			rdata[1].len = IndexTupleDSize(btitem->bti_itup) +
-				(sizeof(BTItemData) - sizeof(IndexTupleData));
-		}
-		rdata[1].buffer = buf;
-		rdata[1].next = NULL;
-		if (P_ISLEAF(pageop))
-			flag |= XLOG_BTREE_LEAF;
-
-		recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
-
-		PageSetLSN(page, recptr);
-		PageSetSUI(page, ThisStartUpID);
-	}
-
-	END_CRIT_SECTION();
-}
-
 /*
  *	_bt_split() -- split a page in the btree.
  *
@@ -729,13 +665,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	lopaque->btpo_next = BufferGetBlockNumber(rbuf);
 	ropaque->btpo_prev = BufferGetBlockNumber(buf);
 	ropaque->btpo_next = oopaque->btpo_next;
-
-	/*
-	 * Must copy the original parent link into both new pages, even though
-	 * it might be quite obsolete by now.  We might need it if this level
-	 * is or recently was the root (see README).
-	 */
-	lopaque->btpo_parent = ropaque->btpo_parent = oopaque->btpo_parent;
+	lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
 
 	/*
 	 * If the page we're splitting is not the rightmost page at its level
@@ -876,34 +806,29 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	if (!rel->rd_istemp)
 	{
 		xl_btree_split xlrec;
-		int			flag = (newitemonleft) ?
-		XLOG_BTREE_SPLEFT : XLOG_BTREE_SPLIT;
-		BlockNumber blkno;
+		uint8		xlinfo;
 		XLogRecPtr	recptr;
 		XLogRecData rdata[4];
 
 		xlrec.target.node = rel->rd_node;
 		ItemPointerSet(&(xlrec.target.tid), *itup_blkno, *itup_off);
 		if (newitemonleft)
-		{
-			blkno = BufferGetBlockNumber(rbuf);
-			BlockIdSet(&(xlrec.otherblk), blkno);
-		}
+			xlrec.otherblk = BufferGetBlockNumber(rbuf);
 		else
-		{
-			blkno = BufferGetBlockNumber(buf);
-			BlockIdSet(&(xlrec.otherblk), blkno);
-		}
-		BlockIdSet(&(xlrec.parentblk), lopaque->btpo_parent);
-		BlockIdSet(&(xlrec.leftblk), lopaque->btpo_prev);
-		BlockIdSet(&(xlrec.rightblk), ropaque->btpo_next);
+			xlrec.otherblk = BufferGetBlockNumber(buf);
+		xlrec.leftblk = lopaque->btpo_prev;
+		xlrec.rightblk = ropaque->btpo_next;
+		xlrec.level = lopaque->btpo.level;
 
 		/*
 		 * Direct access to page is not good but faster - we should
-		 * implement some new func in page API.
+		 * implement some new func in page API.  Note we only store the
+		 * tuples themselves, knowing that the item pointers are in the
+		 * same order and can be reconstructed by scanning the tuples.
 		 */
 		xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
 			((PageHeader) leftpage)->pd_upper;
+
 		rdata[0].buffer = InvalidBuffer;
 		rdata[0].data = (char *) &xlrec;
 		rdata[0].len = SizeOfBtreeSplit;
@@ -933,10 +858,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 			rdata[3].next = NULL;
 		}
 
-		if (P_ISLEAF(lopaque))
-			flag |= XLOG_BTREE_LEAF;
+		if (P_ISROOT(oopaque))
+			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
+		else
+			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
 
-		recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
+		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
 
 		PageSetLSN(leftpage, recptr);
 		PageSetSUI(leftpage, ThisStartUpID);
@@ -1175,48 +1102,179 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
 	}
 }
 
+/*
+ * _bt_insert_parent() -- Insert downlink into parent after a page split.
+ *
+ * On entry, buf and rbuf are the left and right split pages, which we
+ * still hold write locks on per the L&Y algorithm.  We release the
+ * write locks once we have write lock on the parent page.  (Any sooner,
+ * and it'd be possible for some other process to try to split or delete
+ * one of these pages, and get confused because it cannot find the downlink.)
+ *
+ * stack - stack showing how we got here.  May be NULL in cases that don't
+ *			have to be efficient (concurrent ROOT split, WAL recovery)
+ * is_root - we split the true root
+ * is_only - we split a page alone on its level (might have been fast root)
+ *
+ * This is exported so it can be called by nbtxlog.c.
+ */
+void
+_bt_insert_parent(Relation rel,
+				  Buffer buf,
+				  Buffer rbuf,
+				  BTStack stack,
+				  bool is_root,
+				  bool is_only)
+{
+	/*
+	 * Here we have to do something Lehman and Yao don't talk about:
+	 * deal with a root split and construction of a new root.  If our
+	 * stack is empty then we have just split a node on what had been
+	 * the root level when we descended the tree.  If it was still the
+	 * root then we perform a new-root construction.  If it *wasn't*
+	 * the root anymore, search to find the next higher level that
+	 * someone constructed meanwhile, and find the right place to insert
+	 * as for the normal case.
+	 *
+	 * If we have to search for the parent level, we do so by
+	 * re-descending from the root.  This is not super-efficient,
+	 * but it's rare enough not to matter.  (This path is also taken
+	 * when called from WAL recovery --- we have no stack in that case.)
+	 */
+	if (is_root)
+	{
+		Buffer		rootbuf;
+
+		Assert(stack == (BTStack) NULL);
+		Assert(is_only);
+		/* create a new root node and update the metapage */
+		rootbuf = _bt_newroot(rel, buf, rbuf);
+		/* release the split buffers */
+		_bt_wrtbuf(rel, rootbuf);
+		_bt_wrtbuf(rel, rbuf);
+		_bt_wrtbuf(rel, buf);
+	}
+	else
+	{
+		BlockNumber bknum = BufferGetBlockNumber(buf);
+		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
+		Page		page = BufferGetPage(buf);
+		InsertIndexResult newres;
+		BTItem		new_item;
+		BTStackData fakestack;
+		BTItem		ritem;
+		Buffer		pbuf;
+
+		if (stack == (BTStack) NULL)
+		{
+			BTPageOpaque lpageop;
+
+			if (!InRecovery)
+				elog(DEBUG1, "_bt_insert_parent: concurrent ROOT page split");
+			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			/* Find the leftmost page at the next level up */
+			pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
+			/* Set up a phony stack entry pointing there */
+			stack = &fakestack;
+			stack->bts_blkno = BufferGetBlockNumber(pbuf);
+			stack->bts_offset = InvalidOffsetNumber;
+			/* bts_btitem will be initialized below */
+			stack->bts_parent = NULL;
+			_bt_relbuf(rel, pbuf);
+		}
+
+		/* get high key from left page == lowest key on new right page */
+		ritem = (BTItem) PageGetItem(page,
+									 PageGetItemId(page, P_HIKEY));
+
+		/* form an index tuple that points at the new right page */
+		new_item = _bt_formitem(&(ritem->bti_itup));
+		ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+		/*
+		 * Find the parent buffer and get the parent page.
+		 *
+		 * Oops - if we were moved right then we need to change stack
+		 * item! We want to find parent pointing to where we are,
+		 * right ?	  - vadim 05/27/97
+		 */
+		ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
+					   bknum, P_HIKEY);
+
+		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+		/* Now we can write and unlock the children */
+		_bt_wrtbuf(rel, rbuf);
+		_bt_wrtbuf(rel, buf);
+
+		/* Check for error only after writing children */
+		if (pbuf == InvalidBuffer)
+			elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!"
+				 "\n\tRecreate index %s.", RelationGetRelationName(rel));
+
+		/* Recursively update the parent */
+		newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+								0, NULL, new_item, stack->bts_offset,
+								is_only);
+
+		/* be tidy */
+		pfree(newres);
+		pfree(new_item);
+	}
+}
+
 /*
  *	_bt_getstackbuf() -- Walk back up the tree one step, and find the item
  *						 we last looked at in the parent.
  *
- *		This is possible because we save a bit image of the last item
- *		we looked at in the parent, and the update algorithm guarantees
- *		that if items above us in the tree move, they only move right.
+ *		This is possible because we save the downlink from the parent item,
+ *		which is enough to uniquely identify it.  Insertions into the parent
+ *		level could cause the item to move right; deletions could cause it
+ *		to move left, but not left of the page we previously found it in.
  *
- *		Also, re-set bts_blkno & bts_offset if changed.
+ *		Adjusts bts_blkno & bts_offset if changed.
+ *
+ *		Returns InvalidBuffer if item not found (should not happen).
  */
 static Buffer
 _bt_getstackbuf(Relation rel, BTStack stack, int access)
 {
 	BlockNumber blkno;
-	Buffer		buf;
-	OffsetNumber start,
-				offnum,
-				maxoff;
-	Page		page;
-	ItemId		itemid;
-	BTItem		item;
-	BTPageOpaque opaque;
+	OffsetNumber start;
 
 	blkno = stack->bts_blkno;
-	buf = _bt_getbuf(rel, blkno, access);
-	page = BufferGetPage(buf);
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-	maxoff = PageGetMaxOffsetNumber(page);
-
 	start = stack->bts_offset;
 
-	/*
-	 * _bt_insertonpg set bts_offset to InvalidOffsetNumber in the case of
-	 * concurrent ROOT page split.	Also, watch out for possibility that
-	 * page has a high key now when it didn't before.
-	 */
-	if (start < P_FIRSTDATAKEY(opaque))
-		start = P_FIRSTDATAKEY(opaque);
-
 	for (;;)
 	{
-		/* see if it's on this page */
+		Buffer		buf;
+		Page		page;
+		BTPageOpaque opaque;
+		OffsetNumber offnum,
+					minoff,
+					maxoff;
+		ItemId		itemid;
+		BTItem		item;
+
+		buf = _bt_getbuf(rel, blkno, access);
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		minoff = P_FIRSTDATAKEY(opaque);
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		/*
+		 * start = InvalidOffsetNumber means "search the whole page".
+		 * We need this test anyway due to possibility that
+		 * page has a high key now when it didn't before.
+		 */
+		if (start < minoff)
+			start = minoff;
+
+		/*
+		 * These loops will check every item on the page --- but in an order
+		 * that's attuned to the probability of where it actually is.  Scan
+		 * to the right first, then to the left.
+		 */
 		for (offnum = start;
 			 offnum <= maxoff;
 			 offnum = OffsetNumberNext(offnum))
@@ -1232,23 +1290,32 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
 			}
 		}
 
+		for (offnum = OffsetNumberPrev(start);
+			 offnum >= minoff;
+			 offnum = OffsetNumberPrev(offnum))
+		{
+			itemid = PageGetItemId(page, offnum);
+			item = (BTItem) PageGetItem(page, itemid);
+			if (BTItemSame(item, &stack->bts_btitem))
+			{
+				/* Return accurate pointer to where link is now */
+				stack->bts_blkno = blkno;
+				stack->bts_offset = offnum;
+				return buf;
+			}
+		}
+
 		/*
-		 * by here, the item we're looking for moved right at least one
-		 * page
+		 * The item we're looking for moved right at least one page.
 		 */
 		if (P_RIGHTMOST(opaque))
 		{
 			_bt_relbuf(rel, buf);
 			return (InvalidBuffer);
 		}
-
 		blkno = opaque->btpo_next;
+		start = InvalidOffsetNumber;
 		_bt_relbuf(rel, buf);
-		buf = _bt_getbuf(rel, blkno, access);
-		page = BufferGetPage(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		maxoff = PageGetMaxOffsetNumber(page);
-		start = P_FIRSTDATAKEY(opaque);
 	}
 }
 
@@ -1289,6 +1356,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	Page		metapg;
 	BTMetaPageData *metad;
 
+	lbkno = BufferGetBlockNumber(lbuf);
+	rbkno = BufferGetBlockNumber(rbuf);
+	lpage = BufferGetPage(lbuf);
+	rpage = BufferGetPage(rbuf);
+
 	/* get a new root page */
 	rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 	rootpage = BufferGetPage(rootbuf);
@@ -1303,22 +1375,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	/* set btree special data */
 	rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 	rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
-	rootopaque->btpo_flags |= BTP_ROOT;
-	rootopaque->btpo_parent = BTREE_METAPAGE;
+	rootopaque->btpo_flags = BTP_ROOT;
+	rootopaque->btpo.level =
+		((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
 
-	lbkno = BufferGetBlockNumber(lbuf);
-	rbkno = BufferGetBlockNumber(rbuf);
-	lpage = BufferGetPage(lbuf);
-	rpage = BufferGetPage(rbuf);
-
-	/*
-	 * Make sure pages in old root level have valid parent links --- we
-	 * will need this in _bt_insertonpg() if a concurrent root split
-	 * happens (see README).
-	 */
-	((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_parent =
-		((BTPageOpaque) PageGetSpecialPointer(rpage))->btpo_parent =
-		rootblknum;
+	/* update metapage data */
+	metad->btm_root = rootblknum;
+	metad->btm_level = rootopaque->btpo.level;
+	metad->btm_fastroot = rootblknum;
+	metad->btm_fastlevel = rootopaque->btpo.level;
 
 	/*
 	 * Create downlink item for left page (old root).  Since this will be
@@ -1356,9 +1421,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		elog(PANIC, "btree: failed to add rightkey to new root page");
 	pfree(new_item);
 
-	metad->btm_root = rootblknum;
-	(metad->btm_level)++;
-
 	/* XLOG stuff */
 	if (!rel->rd_istemp)
 	{
@@ -1367,8 +1429,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		XLogRecData rdata[2];
 
 		xlrec.node = rel->rd_node;
+		xlrec.rootblk = rootblknum;
 		xlrec.level = metad->btm_level;
-		BlockIdSet(&(xlrec.rootblk), rootblknum);
+
 		rdata[0].buffer = InvalidBuffer;
 		rdata[0].data = (char *) &xlrec;
 		rdata[0].len = SizeOfBtreeNewroot;
@@ -1390,8 +1453,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		PageSetSUI(rootpage, ThisStartUpID);
 		PageSetLSN(metapg, recptr);
 		PageSetSUI(metapg, ThisStartUpID);
-
-		/* we changed their btpo_parent */
 		PageSetLSN(lpage, recptr);
 		PageSetSUI(lpage, ThisStartUpID);
 		PageSetLSN(rpage, recptr);
@@ -1406,620 +1467,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	return (rootbuf);
 }
 
-/*
- * In the event old root page was splitted but no new one was created we
- * build required parent levels keeping write lock on old root page.
- * Note: it's assumed that old root page' btpo_parent points to meta page,
- * ie not to parent page. On exit, new root page buffer is write locked.
- * If "release" is TRUE then oldrootbuf will be released immediately
- * after upper level is builded.
- */
-Buffer
-_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release)
-{
-	Buffer		rootbuf;
-	BlockNumber rootblk;
-	Page		rootpage;
-	XLogRecPtr	rootLSN;
-	Page		oldrootpage = BufferGetPage(oldrootbuf);
-	BTPageOpaque oldrootopaque = (BTPageOpaque)
-	PageGetSpecialPointer(oldrootpage);
-	Buffer		buf,
-				leftbuf,
-				rightbuf;
-	Page		page,
-				leftpage,
-				rightpage;
-	BTPageOpaque opaque,
-				leftopaque,
-				rightopaque;
-	OffsetNumber newitemoff;
-	BTItem		btitem,
-				ritem;
-	Size		itemsz;
-
-	if (!P_LEFTMOST(oldrootopaque) || P_RIGHTMOST(oldrootopaque))
-		elog(ERROR, "bt_fixroot: not valid old root page");
-
-	/* Read right neighbor and create new root page */
-	leftbuf = _bt_getbuf(rel, oldrootopaque->btpo_next, BT_WRITE);
-	leftpage = BufferGetPage(leftbuf);
-	leftopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
-	rootbuf = _bt_newroot(rel, oldrootbuf, leftbuf);
-	rootpage = BufferGetPage(rootbuf);
-	rootLSN = PageGetLSN(rootpage);
-	rootblk = BufferGetBlockNumber(rootbuf);
-
-	/* parent page where to insert pointers */
-	buf = rootbuf;
-	page = BufferGetPage(buf);
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	/*
-	 * Now read other pages (if any) on level and add them to new root.
-	 * Here we break one of our locking rules - never hold lock on parent
-	 * page when acquiring lock on its child, - but we free from deadlock:
-	 *
-	 * If concurrent process will split one of pages on this level then it
-	 * will see either btpo_parent == metablock or btpo_parent == rootblk.
-	 * In first case it will give up its locks and walk to the leftmost
-	 * page (oldrootbuf) in _bt_fixup() - ie it will wait for us and let
-	 * us continue. In second case it will try to lock rootbuf keeping its
-	 * locks on buffers we already passed, also waiting for us. If we'll
-	 * have to unlock rootbuf (split it) and that process will have to
-	 * split page of new level we created (level of rootbuf) then it will
-	 * wait while we create upper level. Etc.
-	 */
-	while (!P_RIGHTMOST(leftopaque))
-	{
-		rightbuf = _bt_getbuf(rel, leftopaque->btpo_next, BT_WRITE);
-		rightpage = BufferGetPage(rightbuf);
-		rightopaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
-
-		/*
-		 * Update LSN & StartUpID of child page buffer to ensure that it
-		 * will be written on disk after flushing log record for new root
-		 * creation. Unfortunately, for the moment (?) we do not log this
-		 * operation and so possibly break our rule to log entire page
-		 * content on first after checkpoint modification.
-		 */
-		HOLD_INTERRUPTS();
-		rightopaque->btpo_parent = rootblk;
-		if (XLByteLT(PageGetLSN(rightpage), rootLSN))
-			PageSetLSN(rightpage, rootLSN);
-		PageSetSUI(rightpage, ThisStartUpID);
-		RESUME_INTERRUPTS();
-
-		ritem = (BTItem) PageGetItem(leftpage, PageGetItemId(leftpage, P_HIKEY));
-		btitem = _bt_formitem(&(ritem->bti_itup));
-		ItemPointerSet(&(btitem->bti_itup.t_tid), leftopaque->btpo_next, P_HIKEY);
-		itemsz = IndexTupleDSize(btitem->bti_itup)
-			+ (sizeof(BTItemData) - sizeof(IndexTupleData));
-		itemsz = MAXALIGN(itemsz);
-
-		newitemoff = OffsetNumberNext(PageGetMaxOffsetNumber(page));
-
-		if (PageGetFreeSpace(page) < itemsz)
-		{
-			Buffer		newbuf;
-			OffsetNumber firstright;
-			OffsetNumber itup_off;
-			BlockNumber itup_blkno;
-			bool		newitemonleft;
-
-			firstright = _bt_findsplitloc(rel, page,
-									 newitemoff, itemsz, &newitemonleft);
-			newbuf = _bt_split(rel, buf, firstright,
-							   newitemoff, itemsz, btitem, newitemonleft,
-							   &itup_off, &itup_blkno);
-			/* Keep lock on new "root" buffer ! */
-			if (buf != rootbuf)
-				_bt_relbuf(rel, buf);
-			buf = newbuf;
-			page = BufferGetPage(buf);
-			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		}
-		else
-			_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
-
-		/* give up left buffer */
-		_bt_wrtbuf(rel, leftbuf);
-		pfree(btitem);
-		leftbuf = rightbuf;
-		leftpage = rightpage;
-		leftopaque = rightopaque;
-	}
-
-	/* give up rightmost page buffer */
-	_bt_wrtbuf(rel, leftbuf);
-
-	/*
-	 * Here we hold locks on old root buffer, new root buffer we've
-	 * created with _bt_newroot() - rootbuf, - and buf we've used for last
-	 * insert ops - buf. If rootbuf != buf then we have to create at least
-	 * one more level. And if "release" is TRUE then we give up
-	 * oldrootbuf.
-	 */
-	if (release)
-		_bt_wrtbuf(rel, oldrootbuf);
-
-	if (rootbuf != buf)
-	{
-		_bt_wrtbuf(rel, buf);
-		return (_bt_fixroot(rel, rootbuf, true));
-	}
-
-	return (rootbuf);
-}
-
-/*
- * Using blkno of leftmost page on a level inside tree this func
- * checks/fixes tree from this level up to the root page.
- */
-static void
-_bt_fixtree(Relation rel, BlockNumber blkno)
-{
-	Buffer		buf;
-	Page		page;
-	BTPageOpaque opaque;
-	BlockNumber pblkno;
-
-	for (;;)
-	{
-		buf = _bt_getbuf(rel, blkno, BT_READ);
-		page = BufferGetPage(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		if (!P_LEFTMOST(opaque) || P_ISLEAF(opaque))
-			elog(ERROR, "bt_fixtree[%s]: invalid start page (need to recreate index)", RelationGetRelationName(rel));
-		pblkno = opaque->btpo_parent;
-
-		/* check/fix entire level */
-		_bt_fixlevel(rel, buf, InvalidBlockNumber);
-
-		/*
-		 * No pins/locks are held here. Re-read start page if its
-		 * btpo_parent pointed to meta page else go up one level.
-		 *
-		 * XXX have to catch InvalidBlockNumber at the moment -:(
-		 */
-		if (pblkno == BTREE_METAPAGE || pblkno == InvalidBlockNumber)
-		{
-			buf = _bt_getbuf(rel, blkno, BT_WRITE);
-			page = BufferGetPage(buf);
-			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-			if (P_ISROOT(opaque))
-			{
-				/* Tree is Ok now */
-				_bt_relbuf(rel, buf);
-				return;
-			}
-			/* Call _bt_fixroot() if there is no upper level */
-			if (BTreeInvalidParent(opaque))
-			{
-				elog(WARNING, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel));
-				buf = _bt_fixroot(rel, buf, true);
-				_bt_relbuf(rel, buf);
-				return;
-			}
-			/* Have to go up one level */
-			pblkno = opaque->btpo_parent;
-			_bt_relbuf(rel, buf);
-		}
-		blkno = pblkno;
-	}
-
-}
-
-/*
- * Check/fix level starting from page in buffer buf up to block
- * limit on *child* level (or till rightmost child page if limit
- * is InvalidBlockNumber). Start buffer must be read locked.
- * No pins/locks are held on exit.
- */
-static void
-_bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
-{
-	BlockNumber blkno = BufferGetBlockNumber(buf);
-	Page		page;
-	BTPageOpaque opaque;
-	BlockNumber cblkno[3];
-	OffsetNumber coff[3];
-	Buffer		cbuf[3];
-	Page		cpage[3];
-	BTPageOpaque copaque[3];
-	BTItem		btitem;
-	int			cidx,
-				i;
-	bool		goodbye = false;
-	char		tbuf[BLCKSZ];
-
-	page = BufferGetPage(buf);
-	/* copy page to temp storage */
-	memmove(tbuf, page, PageGetPageSize(page));
-	_bt_relbuf(rel, buf);
-
-	page = (Page) tbuf;
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	/* Initialize first child data */
-	coff[0] = P_FIRSTDATAKEY(opaque);
-	if (coff[0] > PageGetMaxOffsetNumber(page))
-		elog(ERROR, "bt_fixlevel[%s]: invalid maxoff on start page (need to recreate index)", RelationGetRelationName(rel));
-	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, coff[0]));
-	cblkno[0] = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
-	cbuf[0] = _bt_getbuf(rel, cblkno[0], BT_READ);
-	cpage[0] = BufferGetPage(cbuf[0]);
-	copaque[0] = (BTPageOpaque) PageGetSpecialPointer(cpage[0]);
-	if (P_LEFTMOST(opaque) && !P_LEFTMOST(copaque[0]))
-		elog(ERROR, "bt_fixtlevel[%s]: non-leftmost child page of leftmost parent (need to recreate index)", RelationGetRelationName(rel));
-	/* caller should take care and avoid this */
-	if (P_RIGHTMOST(copaque[0]))
-		elog(ERROR, "bt_fixtlevel[%s]: invalid start child (need to recreate index)", RelationGetRelationName(rel));
-
-	for (;;)
-	{
-		/*
-		 * Read up to 2 more child pages and look for pointers to them in
-		 * *saved* parent page
-		 */
-		coff[1] = coff[2] = InvalidOffsetNumber;
-		for (cidx = 0; cidx < 2;)
-		{
-			cidx++;
-			cblkno[cidx] = (copaque[cidx - 1])->btpo_next;
-			cbuf[cidx] = _bt_getbuf(rel, cblkno[cidx], BT_READ);
-			cpage[cidx] = BufferGetPage(cbuf[cidx]);
-			copaque[cidx] = (BTPageOpaque) PageGetSpecialPointer(cpage[cidx]);
-			coff[cidx] = _bt_getoff(page, cblkno[cidx]);
-
-			/* sanity check */
-			if (coff[cidx] != InvalidOffsetNumber)
-			{
-				for (i = cidx - 1; i >= 0; i--)
-				{
-					if (coff[i] == InvalidOffsetNumber)
-						continue;
-					if (coff[cidx] != coff[i] + 1)
-						elog(ERROR, "bt_fixlevel[%s]: invalid item order(1) (need to recreate index)", RelationGetRelationName(rel));
-					break;
-				}
-			}
-
-			if (P_RIGHTMOST(copaque[cidx]))
-				break;
-		}
-
-		/*
-		 * Read parent page and insert missed pointers.
-		 */
-		if (coff[1] == InvalidOffsetNumber ||
-			(cidx == 2 && coff[2] == InvalidOffsetNumber))
-		{
-			Buffer		newbuf;
-			Page		newpage;
-			BTPageOpaque newopaque;
-			BTItem		ritem;
-			Size		itemsz;
-			OffsetNumber newitemoff;
-			BlockNumber parblk[3];
-			BTStackData stack;
-
-			stack.bts_parent = NULL;
-			stack.bts_blkno = blkno;
-			stack.bts_offset = InvalidOffsetNumber;
-			ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid),
-						   cblkno[0], P_HIKEY);
-
-			buf = _bt_getstackbuf(rel, &stack, BT_WRITE);
-			if (buf == InvalidBuffer)
-				elog(ERROR, "bt_fixlevel[%s]: pointer disappeared (need to recreate index)", RelationGetRelationName(rel));
-
-			page = BufferGetPage(buf);
-			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-			coff[0] = stack.bts_offset;
-			blkno = BufferGetBlockNumber(buf);
-			parblk[0] = blkno;
-
-			/* Check/insert missed pointers */
-			for (i = 1; i <= cidx; i++)
-			{
-				coff[i] = _bt_getoff(page, cblkno[i]);
-
-				/* sanity check */
-				parblk[i] = BufferGetBlockNumber(buf);
-				if (coff[i] != InvalidOffsetNumber)
-				{
-					if (parblk[i] == parblk[i - 1] &&
-						coff[i] != coff[i - 1] + 1)
-						elog(ERROR, "bt_fixlevel[%s]: invalid item order(2) (need to recreate index)", RelationGetRelationName(rel));
-					continue;
-				}
-				/* Have to check next page ? */
-				if ((!P_RIGHTMOST(opaque)) &&
-					coff[i - 1] == PageGetMaxOffsetNumber(page))		/* yes */
-				{
-					newbuf = _bt_getbuf(rel, opaque->btpo_next, BT_WRITE);
-					newpage = BufferGetPage(newbuf);
-					newopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
-					coff[i] = _bt_getoff(newpage, cblkno[i]);
-					if (coff[i] != InvalidOffsetNumber) /* found ! */
-					{
-						if (coff[i] != P_FIRSTDATAKEY(newopaque))
-							elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel));
-						_bt_relbuf(rel, buf);
-						buf = newbuf;
-						page = newpage;
-						opaque = newopaque;
-						blkno = BufferGetBlockNumber(buf);
-						parblk[i] = blkno;
-						continue;
-					}
-					/* unfound - need to insert on current page */
-					_bt_relbuf(rel, newbuf);
-				}
-				/* insert pointer */
-				ritem = (BTItem) PageGetItem(cpage[i - 1],
-								   PageGetItemId(cpage[i - 1], P_HIKEY));
-				btitem = _bt_formitem(&(ritem->bti_itup));
-				ItemPointerSet(&(btitem->bti_itup.t_tid), cblkno[i], P_HIKEY);
-				itemsz = IndexTupleDSize(btitem->bti_itup)
-					+ (sizeof(BTItemData) - sizeof(IndexTupleData));
-				itemsz = MAXALIGN(itemsz);
-
-				newitemoff = coff[i - 1] + 1;
-
-				if (PageGetFreeSpace(page) < itemsz)
-				{
-					OffsetNumber firstright;
-					OffsetNumber itup_off;
-					BlockNumber itup_blkno;
-					bool		newitemonleft;
-
-					firstright = _bt_findsplitloc(rel, page,
-									 newitemoff, itemsz, &newitemonleft);
-					newbuf = _bt_split(rel, buf, firstright,
-							   newitemoff, itemsz, btitem, newitemonleft,
-									   &itup_off, &itup_blkno);
-					/* what buffer we need in ? */
-					if (newitemonleft)
-						_bt_relbuf(rel, newbuf);
-					else
-					{
-						_bt_relbuf(rel, buf);
-						buf = newbuf;
-						page = BufferGetPage(buf);
-						opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-					}
-					blkno = BufferGetBlockNumber(buf);
-					coff[i] = itup_off;
-				}
-				else
-				{
-					_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
-					coff[i] = newitemoff;
-				}
-
-				pfree(btitem);
-				parblk[i] = blkno;
-			}
-
-			/* copy page with pointer to cblkno[cidx] to temp storage */
-			memmove(tbuf, page, PageGetPageSize(page));
-			_bt_relbuf(rel, buf);
-			page = (Page) tbuf;
-			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		}
-
-		/* Continue if current check/fix level page is rightmost */
-		if (P_RIGHTMOST(opaque))
-			goodbye = false;
-
-		/* Pointers to child pages are Ok - right end of child level ? */
-		_bt_relbuf(rel, cbuf[0]);
-		_bt_relbuf(rel, cbuf[1]);
-		if (cidx == 1 ||
-			(cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye)))
-		{
-			if (cidx == 2)
-				_bt_relbuf(rel, cbuf[2]);
-			return;
-		}
-		if (cblkno[0] == limit || cblkno[1] == limit)
-			goodbye = true;
-		cblkno[0] = cblkno[2];
-		cbuf[0] = cbuf[2];
-		cpage[0] = cpage[2];
-		copaque[0] = copaque[2];
-		coff[0] = coff[2];
-	}
-}
-
-/*
- * Check/fix part of tree - branch - up from parent of level with blocks
- * lblkno and rblknum. We first ensure that parent level has pointers
- * to both lblkno & rblknum and if those pointers are on different
- * parent pages then do the same for parent level, etc. No locks must
- * be held on target level and upper on entry. No locks will be held
- * on exit. Stack created when traversing tree down should be provided and
- * it must points to parent level. rblkno must be on the right from lblkno.
- * (This function is special edition of more expensive _bt_fixtree(),
- * but it doesn't guarantee full consistency of tree.)
- */
-static void
-_bt_fixbranch(Relation rel, BlockNumber lblkno,
-			  BlockNumber rblkno, BTStack true_stack)
-{
-	BlockNumber blkno = true_stack->bts_blkno;
-	BTStackData stack;
-	BTPageOpaque opaque;
-	Buffer		buf,
-				rbuf;
-	Page		page;
-	OffsetNumber offnum;
-
-	true_stack = true_stack->bts_parent;
-	for (;;)
-	{
-		buf = _bt_getbuf(rel, blkno, BT_READ);
-
-		/* Check/fix parent level pointed by blkno */
-		_bt_fixlevel(rel, buf, rblkno);
-
-		/*
-		 * Here parent level should have pointers for both lblkno and
-		 * rblkno and we have to find them.
-		 */
-		stack.bts_parent = NULL;
-		stack.bts_blkno = blkno;
-		stack.bts_offset = InvalidOffsetNumber;
-		ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), lblkno, P_HIKEY);
-		buf = _bt_getstackbuf(rel, &stack, BT_READ);
-		if (buf == InvalidBuffer)
-			elog(ERROR, "bt_fixbranch[%s]: left pointer unfound (need to recreate index)", RelationGetRelationName(rel));
-		page = BufferGetPage(buf);
-		offnum = _bt_getoff(page, rblkno);
-
-		if (offnum != InvalidOffsetNumber)		/* right pointer found */
-		{
-			if (offnum <= stack.bts_offset)
-				elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel));
-			_bt_relbuf(rel, buf);
-			return;
-		}
-
-		/* Pointers are on different parent pages - find right one */
-		lblkno = BufferGetBlockNumber(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		if (P_RIGHTMOST(opaque))
-			elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(1) (need to recreate index)", RelationGetRelationName(rel));
-
-		stack.bts_parent = NULL;
-		stack.bts_blkno = opaque->btpo_next;
-		stack.bts_offset = InvalidOffsetNumber;
-		ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), rblkno, P_HIKEY);
-		rbuf = _bt_getstackbuf(rel, &stack, BT_READ);
-		if (rbuf == InvalidBuffer)
-			elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel));
-		rblkno = BufferGetBlockNumber(rbuf);
-		_bt_relbuf(rel, rbuf);
-
-		/*
-		 * If we have parent item in true_stack then go up one level and
-		 * ensure that it has pointers to new lblkno & rblkno.
-		 */
-		if (true_stack)
-		{
-			_bt_relbuf(rel, buf);
-			blkno = true_stack->bts_blkno;
-			true_stack = true_stack->bts_parent;
-			continue;
-		}
-
-		/*
-		 * Well, we are on the level that was root or unexistent when we
-		 * started traversing tree down. If btpo_parent is updated then
-		 * we'll use it to continue, else we'll fix/restore upper levels
-		 * entirely.
-		 */
-		if (!BTreeInvalidParent(opaque))
-		{
-			blkno = opaque->btpo_parent;
-			_bt_relbuf(rel, buf);
-			continue;
-		}
-
-		/* Have to switch to excl buf lock and re-check btpo_parent */
-		_bt_relbuf(rel, buf);
-		buf = _bt_getbuf(rel, blkno, BT_WRITE);
-		page = BufferGetPage(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		if (!BTreeInvalidParent(opaque))
-		{
-			blkno = opaque->btpo_parent;
-			_bt_relbuf(rel, buf);
-			continue;
-		}
-
-		/*
-		 * We hold excl lock on some internal page with unupdated
-		 * btpo_parent - time for _bt_fixup.
-		 */
-		break;
-	}
-
-	elog(WARNING, "bt_fixbranch[%s]: fixing upper levels", RelationGetRelationName(rel));
-	_bt_fixup(rel, buf);
-
-	return;
-}
-
-/*
- * Having buf excl locked this routine walks to the left on level and
- * uses either _bt_fixtree() or _bt_fixroot() to create/check&fix upper
- * levels. No buffer pins/locks will be held on exit.
- */
-static void
-_bt_fixup(Relation rel, Buffer buf)
-{
-	Page		page;
-	BTPageOpaque opaque;
-	BlockNumber blkno;
-
-	for (;;)
-	{
-		page = BufferGetPage(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-		/*
-		 * If someone else already created parent pages then it's time for
-		 * _bt_fixtree() to check upper levels and fix them, if required.
-		 */
-		if (!BTreeInvalidParent(opaque))
-		{
-			blkno = opaque->btpo_parent;
-			_bt_relbuf(rel, buf);
-			elog(WARNING, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel));
-			_bt_fixtree(rel, blkno);
-			return;
-		}
-		if (P_LEFTMOST(opaque))
-			break;
-		blkno = opaque->btpo_prev;
-		_bt_relbuf(rel, buf);
-		buf = _bt_getbuf(rel, blkno, BT_WRITE);
-	}
-
-	/*
-	 * Ok, we are on the leftmost page, it's write locked by us and its
-	 * btpo_parent points to meta page - time for _bt_fixroot().
-	 */
-	elog(WARNING, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel));
-	buf = _bt_fixroot(rel, buf, true);
-	_bt_relbuf(rel, buf);
-}
-
-static OffsetNumber
-_bt_getoff(Page page, BlockNumber blkno)
-{
-	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
-	OffsetNumber offnum = P_FIRSTDATAKEY(opaque);
-	BlockNumber curblkno;
-	ItemId		itemid;
-	BTItem		item;
-
-	for (; offnum <= maxoff; offnum++)
-	{
-		itemid = PageGetItemId(page, offnum);
-		item = (BTItem) PageGetItem(page, itemid);
-		curblkno = ItemPointerGetBlockNumber(&(item->bti_itup.t_tid));
-		if (curblkno == blkno)
-			return (offnum);
-	}
-
-	return (InvalidOffsetNumber);
-}
-
 /*
  *	_bt_pgaddtup() -- add a tuple to a particular page in the index.
  *
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 110de69406..c9879b73ae 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.58 2002/08/06 02:36:33 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
  *
  *	NOTES
  *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -47,15 +47,16 @@ extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
 
 #define USELOCKING		(!BuildingBtree && !IsInitProcessingMode())
 
+
 /*
- *	_bt_metapinit() -- Initialize the metadata page of a btree.
+ *	_bt_metapinit() -- Initialize the metadata page of a new btree.
  */
 void
 _bt_metapinit(Relation rel)
 {
 	Buffer		buf;
 	Page		pg;
-	BTMetaPageData metad;
+	BTMetaPageData *metad;
 	BTPageOpaque op;
 
 	/* can't be sharing this with anyone, now... */
@@ -67,18 +68,51 @@ _bt_metapinit(Relation rel)
 			 RelationGetRelationName(rel));
 
 	buf = ReadBuffer(rel, P_NEW);
+	Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
 	pg = BufferGetPage(buf);
+
+	/* NO ELOG(ERROR) from here till newmeta op is logged */
+	START_CRIT_SECTION();
+
 	_bt_pageinit(pg, BufferGetPageSize(buf));
 
-	metad.btm_magic = BTREE_MAGIC;
-	metad.btm_version = BTREE_VERSION;
-	metad.btm_root = P_NONE;
-	metad.btm_level = 0;
-	memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
+	metad = BTPageGetMeta(pg);
+	metad->btm_magic = BTREE_MAGIC;
+	metad->btm_version = BTREE_VERSION;
+	metad->btm_root = P_NONE;
+	metad->btm_level = 0;
+	metad->btm_fastroot = P_NONE;
+	metad->btm_fastlevel = 0;
 
 	op = (BTPageOpaque) PageGetSpecialPointer(pg);
 	op->btpo_flags = BTP_META;
 
+	/* XLOG stuff */
+	if (!rel->rd_istemp)
+	{
+		xl_btree_newmeta xlrec;
+		XLogRecPtr	recptr;
+		XLogRecData rdata[1];
+
+		xlrec.node = rel->rd_node;
+		xlrec.meta.root = metad->btm_root;
+		xlrec.meta.level = metad->btm_level;
+		xlrec.meta.fastroot = metad->btm_fastroot;
+		xlrec.meta.fastlevel = metad->btm_fastlevel;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &xlrec;
+		rdata[0].len = SizeOfBtreeNewmeta;
+		rdata[0].next = NULL;
+
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
+
+		PageSetLSN(pg, recptr);
+		PageSetSUI(pg, ThisStartUpID);
+	}
+
+	END_CRIT_SECTION();
+
 	WriteBuffer(buf);
 
 	/* all done */
@@ -102,6 +136,14 @@ _bt_metapinit(Relation rel)
  *		NOTE that the returned root page will have only a read lock set
  *		on it even if access = BT_WRITE!
  *
+ *		The returned page is not necessarily the true root --- it could be
+ *		a "fast root" (a page that is alone in its level due to deletions).
+ *		Also, if the root page is split while we are "in flight" to it,
+ *		what we will return is the old root, which is now just the leftmost
+ *		page on a probably-not-very-wide level.  For most purposes this is
+ *		as good as or better than the true root, so we do not bother to
+ *		insist on finding the true root.
+ *
  *		On successful return, the root page is pinned and read-locked.
  *		The metadata page is not locked or pinned on exit.
  */
@@ -162,15 +204,19 @@ _bt_getroot(Relation rel, int access)
 			rootblkno = BufferGetBlockNumber(rootbuf);
 			rootpage = BufferGetPage(rootbuf);
 
+			_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+			rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+			rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+			rootopaque->btpo.level = 0;
+
 			/* NO ELOG(ERROR) till meta is updated */
 			START_CRIT_SECTION();
 
 			metad->btm_root = rootblkno;
-			metad->btm_level = 1;
-
-			_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
-			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
-			rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
+			metad->btm_level = 0;
+			metad->btm_fastroot = rootblkno;
+			metad->btm_fastlevel = 0;
 
 			/* XLOG stuff */
 			if (!rel->rd_istemp)
@@ -180,16 +226,15 @@ _bt_getroot(Relation rel, int access)
 				XLogRecData rdata;
 
 				xlrec.node = rel->rd_node;
-				xlrec.level = 1;
-				BlockIdSet(&(xlrec.rootblk), rootblkno);
+				xlrec.rootblk = rootblkno;
+				xlrec.level = 0;
+
 				rdata.buffer = InvalidBuffer;
 				rdata.data = (char *) &xlrec;
 				rdata.len = SizeOfBtreeNewroot;
 				rdata.next = NULL;
 
-				recptr = XLogInsert(RM_BTREE_ID,
-									XLOG_BTREE_NEWROOT | XLOG_BTREE_LEAF,
-									&rdata);
+				recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
 
 				PageSetLSN(rootpage, recptr);
 				PageSetSUI(rootpage, ThisStartUpID);
@@ -201,7 +246,11 @@ _bt_getroot(Relation rel, int access)
 
 			_bt_wrtnorelbuf(rel, rootbuf);
 
-			/* swap write lock for read lock */
+			/*
+			 * swap root write lock for read lock.  There is no danger of
+			 * anyone else accessing the new root page while it's unlocked,
+			 * since no one else knows where it is yet.
+			 */
 			LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
 			LockBuffer(rootbuf, BT_READ);
 
@@ -221,86 +270,72 @@ _bt_getroot(Relation rel, int access)
 	}
 	else
 	{
-		rootblkno = metad->btm_root;
+		rootblkno = metad->btm_fastroot;
+
 		_bt_relbuf(rel, metabuf);		/* done with the meta page */
 
 		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
 	}
 
 	/*
-	 * Race condition:	If the root page split between the time we looked
-	 * at the metadata page and got the root buffer, then we got the wrong
-	 * buffer.	Release it and try again.
+	 * At this point we hold a pin and read lock on the root (or fast root)
+	 * page, and no pin or lock on the metadata page.  Return that buffer.
 	 */
-	rootpage = BufferGetPage(rootbuf);
-	rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+	return rootbuf;
+}
 
-	if (!P_ISROOT(rootopaque))
+/*
+ *	_bt_gettrueroot() -- Get the true root page of the btree.
+ *
+ *		This is the same as the BT_READ case of _bt_getroot(), except
+ *		we follow the true-root link not the fast-root link.
+ *
+ * By the time we acquire lock on the root page, it might have been split and
+ * not be the true root anymore.  This is okay for the present uses of this
+ * routine; we only really need to be able to move up at least one tree level
+ * from whatever non-root page we were at.  If we ever do need to lock the
+ * one true root page, we could loop here, re-reading the metapage on each
+ * failure.  (Note that it wouldn't do to hold the lock on the metapage while
+ * moving to the root --- that'd deadlock against any concurrent root split.)
+ */
+Buffer
+_bt_gettrueroot(Relation rel)
+{
+	Buffer		metabuf;
+	Page		metapg;
+	BTPageOpaque metaopaque;
+	Buffer		rootbuf;
+	BlockNumber rootblkno;
+	BTMetaPageData *metad;
+
+	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+	metapg = BufferGetPage(metabuf);
+	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+	metad = BTPageGetMeta(metapg);
+
+	if (!(metaopaque->btpo_flags & BTP_META) ||
+		metad->btm_magic != BTREE_MAGIC)
+		elog(ERROR, "Index %s is not a btree",
+			 RelationGetRelationName(rel));
+
+	if (metad->btm_version != BTREE_VERSION)
+		elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
+			 RelationGetRelationName(rel),
+			 metad->btm_version, BTREE_VERSION);
+
+	/* if no root page initialized yet, fail by returning InvalidBuffer */
+	if (metad->btm_root == P_NONE)
 	{
-		/*
-		 * It happened, but if root page splitter failed to create new
-		 * root page then we'll go in loop trying to call _bt_getroot
-		 * again and again.
-		 */
-		if (FixBTree)
-		{
-			Buffer		newrootbuf;
-
-	check_parent:;
-			if (BTreeInvalidParent(rootopaque)) /* unupdated! */
-			{
-				LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-				LockBuffer(rootbuf, BT_WRITE);
-
-				/* handle concurrent fix of root page */
-				if (BTreeInvalidParent(rootopaque))		/* unupdated! */
-				{
-					elog(WARNING, "bt_getroot[%s]: fixing root page", RelationGetRelationName(rel));
-					newrootbuf = _bt_fixroot(rel, rootbuf, true);
-					LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
-					LockBuffer(newrootbuf, BT_READ);
-					rootbuf = newrootbuf;
-					rootpage = BufferGetPage(rootbuf);
-					rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
-					/* New root might be splitted while changing lock */
-					if (P_ISROOT(rootopaque))
-						return (rootbuf);
-					/* rootbuf is read locked */
-					goto check_parent;
-				}
-				else
-				{
-					/* someone else already fixed root */
-					LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-					LockBuffer(rootbuf, BT_READ);
-				}
-			}
-
-			/*
-			 * Ok, here we have old root page with btpo_parent pointing to
-			 * upper level - check parent page because of there is good
-			 * chance that parent is root page.
-			 */
-			newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
-			_bt_relbuf(rel, rootbuf);
-			rootbuf = newrootbuf;
-			rootpage = BufferGetPage(rootbuf);
-			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
-			if (P_ISROOT(rootopaque))
-				return (rootbuf);
-			/* no luck -:( */
-		}
-
-		/* try again */
-		_bt_relbuf(rel, rootbuf);
-		return _bt_getroot(rel, access);
+		_bt_relbuf(rel, metabuf);
+		return InvalidBuffer;
 	}
 
-	/*
-	 * By here, we have a correct lock on the root block, its reference
-	 * count is correct, and we have no lock set on the metadata page.
-	 * Return the root block.
-	 */
+	rootblkno = metad->btm_root;
+
+	_bt_relbuf(rel, metabuf);	/* done with the meta page */
+
+	rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+
 	return rootbuf;
 }
 
@@ -397,13 +432,14 @@ _bt_wrtnorelbuf(Relation rel, Buffer buf)
 
 /*
  *	_bt_pageinit() -- Initialize a new page.
+ *
+ * On return, the page header is initialized; data space is empty;
+ * special space is zeroed out.
  */
 void
 _bt_pageinit(Page page, Size size)
 {
 	PageInit(page, size, sizeof(BTPageOpaqueData));
-	((BTPageOpaque) PageGetSpecialPointer(page))->btpo_parent =
-		InvalidBlockNumber;
 }
 
 /*
@@ -418,9 +454,12 @@ _bt_pageinit(Page page, Size size)
  *		at least the old root page when you call this, you're making a big
  *		mistake.  On exit, metapage data is correct and we no longer have
  *		a pin or lock on the metapage.
+ *
+ * XXX this is not used for splitting anymore, only in nbtsort.c at the
+ * completion of btree building.
  */
 void
-_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
+_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
 {
 	Buffer		metabuf;
 	Page		metap;
@@ -431,12 +470,42 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
 	metap = BufferGetPage(metabuf);
 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
 	Assert(metaopaque->btpo_flags & BTP_META);
+
+	/* NO ELOG(ERROR) from here till newmeta op is logged */
+	START_CRIT_SECTION();
+
 	metad = BTPageGetMeta(metap);
 	metad->btm_root = rootbknum;
-	if (level == 0)				/* called from _do_insert */
-		metad->btm_level += 1;
-	else
-		metad->btm_level = level;		/* called from btsort */
+	metad->btm_level = level;
+	metad->btm_fastroot = rootbknum;
+	metad->btm_fastlevel = level;
+
+	/* XLOG stuff */
+	if (!rel->rd_istemp)
+	{
+		xl_btree_newmeta xlrec;
+		XLogRecPtr	recptr;
+		XLogRecData rdata[1];
+
+		xlrec.node = rel->rd_node;
+		xlrec.meta.root = metad->btm_root;
+		xlrec.meta.level = metad->btm_level;
+		xlrec.meta.fastroot = metad->btm_fastroot;
+		xlrec.meta.fastlevel = metad->btm_fastlevel;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &xlrec;
+		rdata[0].len = SizeOfBtreeNewmeta;
+		rdata[0].next = NULL;
+
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
+
+		PageSetLSN(metap, recptr);
+		PageSetSUI(metap, ThisStartUpID);
+	}
+
+	END_CRIT_SECTION();
+
 	_bt_wrtbuf(rel, metabuf);
 }
 
@@ -467,6 +536,7 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
 
 		xlrec.target.node = rel->rd_node;
 		xlrec.target.tid = *tid;
+
 		rdata[0].buffer = InvalidBuffer;
 		rdata[0].data = (char *) &xlrec;
 		rdata[0].len = SizeOfBtreeDelete;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 3244beb121..de6765415f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,21 +12,17 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.94 2002/11/15 01:26:08 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-
 #include "postgres.h"
 
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/nbtree.h"
 #include "catalog/index.h"
-#include "executor/executor.h"
 #include "miscadmin.h"
-#include "storage/sinval.h"
-#include "access/xlogutils.h"
 
 
 /* Working state for btbuild and its callback */
@@ -817,396 +813,3 @@ _bt_restscan(IndexScanDesc scan)
 		ItemPointerSet(current, blkno, offnum);
 	}
 }
-
-static void
-_bt_restore_page(Page page, char *from, int len)
-{
-	BTItemData	btdata;
-	Size		itemsz;
-	char	   *end = from + len;
-
-	for (; from < end;)
-	{
-		memcpy(&btdata, from, sizeof(BTItemData));
-		itemsz = IndexTupleDSize(btdata.bti_itup) +
-			(sizeof(BTItemData) - sizeof(IndexTupleData));
-		itemsz = MAXALIGN(itemsz);
-		if (PageAddItem(page, (Item) from, itemsz,
-					  FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
-			elog(PANIC, "_bt_restore_page: can't add item to page");
-		from += itemsz;
-	}
-}
-
-static void
-btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_btree_delete *xlrec;
-	Relation	reln;
-	Buffer		buffer;
-	Page		page;
-
-	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
-		return;
-
-	xlrec = (xl_btree_delete *) XLogRecGetData(record);
-	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
-	if (!RelationIsValid(reln))
-		return;
-	buffer = XLogReadBuffer(false, reln,
-						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_delete_redo: block unfound");
-	page = (Page) BufferGetPage(buffer);
-	if (PageIsNew((PageHeader) page))
-		elog(PANIC, "btree_delete_redo: uninitialized page");
-
-	if (XLByteLE(lsn, PageGetLSN(page)))
-	{
-		UnlockAndReleaseBuffer(buffer);
-		return;
-	}
-
-	PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
-
-	PageSetLSN(page, lsn);
-	PageSetSUI(page, ThisStartUpID);
-	UnlockAndWriteBuffer(buffer);
-
-	return;
-}
-
-static void
-btree_xlog_insert(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_btree_insert *xlrec;
-	Relation	reln;
-	Buffer		buffer;
-	Page		page;
-	BTPageOpaque pageop;
-
-	if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
-		return;
-
-	xlrec = (xl_btree_insert *) XLogRecGetData(record);
-	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
-	if (!RelationIsValid(reln))
-		return;
-	buffer = XLogReadBuffer(false, reln,
-						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
-	page = (Page) BufferGetPage(buffer);
-	if (PageIsNew((PageHeader) page))
-		elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
-	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	if (redo)
-	{
-		if (XLByteLE(lsn, PageGetLSN(page)))
-		{
-			UnlockAndReleaseBuffer(buffer);
-			return;
-		}
-		if (PageAddItem(page, (Item) ((char *) xlrec + SizeOfBtreeInsert),
-						record->xl_len - SizeOfBtreeInsert,
-						ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
-						LP_USED) == InvalidOffsetNumber)
-			elog(PANIC, "btree_insert_redo: failed to add item");
-
-		PageSetLSN(page, lsn);
-		PageSetSUI(page, ThisStartUpID);
-		UnlockAndWriteBuffer(buffer);
-	}
-	else
-	{
-		if (XLByteLT(PageGetLSN(page), lsn))
-			elog(PANIC, "btree_insert_undo: bad page LSN");
-
-		if (!P_ISLEAF(pageop))
-		{
-			UnlockAndReleaseBuffer(buffer);
-			return;
-		}
-
-		elog(PANIC, "btree_insert_undo: unimplemented");
-	}
-
-	return;
-}
-
-static void
-btree_xlog_split(bool redo, bool onleft, XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
-	Relation	reln;
-	BlockNumber blkno;
-	Buffer		buffer;
-	Page		page;
-	BTPageOpaque pageop;
-	char	   *op = (redo) ? "redo" : "undo";
-	bool		isleaf = (record->xl_info & XLOG_BTREE_LEAF);
-
-	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
-	if (!RelationIsValid(reln))
-		return;
-
-	/* Left (original) sibling */
-	blkno = (onleft) ? ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
-		BlockIdGetBlockNumber(&(xlrec->otherblk));
-	buffer = XLogReadBuffer(false, reln, blkno);
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_split_%s: lost left sibling", op);
-
-	page = (Page) BufferGetPage(buffer);
-	if (redo)
-		_bt_pageinit(page, BufferGetPageSize(buffer));
-	else if (PageIsNew((PageHeader) page))
-		elog(PANIC, "btree_split_undo: uninitialized left sibling");
-	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	if (redo)
-	{
-		pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
-		pageop->btpo_prev = BlockIdGetBlockNumber(&(xlrec->leftblk));
-		if (onleft)
-			pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->otherblk));
-		else
-			pageop->btpo_next = ItemPointerGetBlockNumber(&(xlrec->target.tid));
-		pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
-
-		_bt_restore_page(page, (char *) xlrec + SizeOfBtreeSplit, xlrec->leftlen);
-
-		PageSetLSN(page, lsn);
-		PageSetSUI(page, ThisStartUpID);
-		UnlockAndWriteBuffer(buffer);
-	}
-	else
-/* undo */
-	{
-		if (XLByteLT(PageGetLSN(page), lsn))
-			elog(PANIC, "btree_split_undo: bad left sibling LSN");
-		elog(PANIC, "btree_split_undo: unimplemented");
-	}
-
-	/* Right (new) sibling */
-	blkno = (onleft) ? BlockIdGetBlockNumber(&(xlrec->otherblk)) :
-		ItemPointerGetBlockNumber(&(xlrec->target.tid));
-	buffer = XLogReadBuffer((redo) ? true : false, reln, blkno);
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_split_%s: lost right sibling", op);
-
-	page = (Page) BufferGetPage(buffer);
-	if (redo)
-		_bt_pageinit(page, BufferGetPageSize(buffer));
-	else if (PageIsNew((PageHeader) page))
-		elog(PANIC, "btree_split_undo: uninitialized right sibling");
-	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	if (redo)
-	{
-		pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
-		pageop->btpo_prev = (onleft) ?
-			ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
-			BlockIdGetBlockNumber(&(xlrec->otherblk));
-		pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->rightblk));
-		pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
-
-		_bt_restore_page(page,
-					  (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
-					 record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
-
-		PageSetLSN(page, lsn);
-		PageSetSUI(page, ThisStartUpID);
-		UnlockAndWriteBuffer(buffer);
-	}
-	else
-/* undo */
-	{
-		if (XLByteLT(PageGetLSN(page), lsn))
-			elog(PANIC, "btree_split_undo: bad right sibling LSN");
-		elog(PANIC, "btree_split_undo: unimplemented");
-	}
-
-	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
-		return;
-
-	/* Right (next) page */
-	blkno = BlockIdGetBlockNumber(&(xlrec->rightblk));
-	if (blkno == P_NONE)
-		return;
-
-	buffer = XLogReadBuffer(false, reln, blkno);
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_split_redo: lost next right page");
-
-	page = (Page) BufferGetPage(buffer);
-	if (PageIsNew((PageHeader) page))
-		elog(PANIC, "btree_split_redo: uninitialized next right page");
-
-	if (XLByteLE(lsn, PageGetLSN(page)))
-	{
-		UnlockAndReleaseBuffer(buffer);
-		return;
-	}
-	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-	pageop->btpo_prev = (onleft) ?
-		BlockIdGetBlockNumber(&(xlrec->otherblk)) :
-		ItemPointerGetBlockNumber(&(xlrec->target.tid));
-
-	PageSetLSN(page, lsn);
-	PageSetSUI(page, ThisStartUpID);
-	UnlockAndWriteBuffer(buffer);
-}
-
-static void
-btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
-	Relation	reln;
-	Buffer		buffer;
-	Page		page;
-	BTPageOpaque pageop;
-	Buffer		metabuf;
-	Page		metapg;
-	BTMetaPageData md;
-
-	if (!redo)
-		return;
-
-	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
-	if (!RelationIsValid(reln))
-		return;
-	buffer = XLogReadBuffer(true, reln, BlockIdGetBlockNumber(&(xlrec->rootblk)));
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_newroot_redo: no root page");
-	metabuf = XLogReadBuffer(false, reln, BTREE_METAPAGE);
-	if (!BufferIsValid(buffer))
-		elog(PANIC, "btree_newroot_redo: no metapage");
-	page = (Page) BufferGetPage(buffer);
-	_bt_pageinit(page, BufferGetPageSize(buffer));
-	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-	pageop->btpo_flags |= BTP_ROOT;
-	pageop->btpo_prev = pageop->btpo_next = P_NONE;
-	pageop->btpo_parent = BTREE_METAPAGE;
-
-	if (record->xl_info & XLOG_BTREE_LEAF)
-		pageop->btpo_flags |= BTP_LEAF;
-
-	if (record->xl_len > SizeOfBtreeNewroot)
-		_bt_restore_page(page,
-						 (char *) xlrec + SizeOfBtreeNewroot,
-						 record->xl_len - SizeOfBtreeNewroot);
-
-	PageSetLSN(page, lsn);
-	PageSetSUI(page, ThisStartUpID);
-	UnlockAndWriteBuffer(buffer);
-
-	metapg = BufferGetPage(metabuf);
-	_bt_pageinit(metapg, BufferGetPageSize(metabuf));
-	md.btm_magic = BTREE_MAGIC;
-	md.btm_version = BTREE_VERSION;
-	md.btm_root = BlockIdGetBlockNumber(&(xlrec->rootblk));
-	md.btm_level = xlrec->level;
-	memcpy((char *) BTPageGetMeta(metapg), (char *) &md, sizeof(md));
-
-	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
-	pageop->btpo_flags = BTP_META;
-
-	PageSetLSN(metapg, lsn);
-	PageSetSUI(metapg, ThisStartUpID);
-	UnlockAndWriteBuffer(metabuf);
-}
-
-void
-btree_redo(XLogRecPtr lsn, XLogRecord *record)
-{
-	uint8		info = record->xl_info & ~XLR_INFO_MASK;
-
-	info &= ~XLOG_BTREE_LEAF;
-	if (info == XLOG_BTREE_DELETE)
-		btree_xlog_delete(true, lsn, record);
-	else if (info == XLOG_BTREE_INSERT)
-		btree_xlog_insert(true, lsn, record);
-	else if (info == XLOG_BTREE_SPLIT)
-		btree_xlog_split(true, false, lsn, record);		/* new item on the right */
-	else if (info == XLOG_BTREE_SPLEFT)
-		btree_xlog_split(true, true, lsn, record);		/* new item on the left */
-	else if (info == XLOG_BTREE_NEWROOT)
-		btree_xlog_newroot(true, lsn, record);
-	else
-		elog(PANIC, "btree_redo: unknown op code %u", info);
-}
-
-void
-btree_undo(XLogRecPtr lsn, XLogRecord *record)
-{
-	uint8		info = record->xl_info & ~XLR_INFO_MASK;
-
-	info &= ~XLOG_BTREE_LEAF;
-	if (info == XLOG_BTREE_DELETE)
-		btree_xlog_delete(false, lsn, record);
-	else if (info == XLOG_BTREE_INSERT)
-		btree_xlog_insert(false, lsn, record);
-	else if (info == XLOG_BTREE_SPLIT)
-		btree_xlog_split(false, false, lsn, record);	/* new item on the right */
-	else if (info == XLOG_BTREE_SPLEFT)
-		btree_xlog_split(false, true, lsn, record);		/* new item on the left */
-	else if (info == XLOG_BTREE_NEWROOT)
-		btree_xlog_newroot(false, lsn, record);
-	else
-		elog(PANIC, "btree_undo: unknown op code %u", info);
-}
-
-static void
-out_target(char *buf, xl_btreetid *target)
-{
-	sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
-			target->node.tblNode, target->node.relNode,
-			ItemPointerGetBlockNumber(&(target->tid)),
-			ItemPointerGetOffsetNumber(&(target->tid)));
-}
-
-void
-btree_desc(char *buf, uint8 xl_info, char *rec)
-{
-	uint8		info = xl_info & ~XLR_INFO_MASK;
-
-	info &= ~XLOG_BTREE_LEAF;
-	if (info == XLOG_BTREE_INSERT)
-	{
-		xl_btree_insert *xlrec = (xl_btree_insert *) rec;
-
-		strcat(buf, "insert: ");
-		out_target(buf, &(xlrec->target));
-	}
-	else if (info == XLOG_BTREE_DELETE)
-	{
-		xl_btree_delete *xlrec = (xl_btree_delete *) rec;
-
-		strcat(buf, "delete: ");
-		out_target(buf, &(xlrec->target));
-	}
-	else if (info == XLOG_BTREE_SPLIT || info == XLOG_BTREE_SPLEFT)
-	{
-		xl_btree_split *xlrec = (xl_btree_split *) rec;
-
-		sprintf(buf + strlen(buf), "split(%s): ",
-				(info == XLOG_BTREE_SPLIT) ? "right" : "left");
-		out_target(buf, &(xlrec->target));
-		sprintf(buf + strlen(buf), "; oth %u; rgh %u",
-				BlockIdGetBlockNumber(&xlrec->otherblk),
-				BlockIdGetBlockNumber(&xlrec->rightblk));
-	}
-	else if (info == XLOG_BTREE_NEWROOT)
-	{
-		xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
-
-		sprintf(buf + strlen(buf), "root: node %u/%u; blk %u",
-				xlrec->node.tblNode, xlrec->node.relNode,
-				BlockIdGetBlockNumber(&xlrec->rootblk));
-	}
-	else
-		strcat(buf, "UNKNOWN");
-}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 66f2428cd4..0daae3cd58 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -895,6 +895,89 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 	return true;
 }
 
+/*
+ * _bt_get_endpoint() -- Find the first or last page on a given tree level
+ *
+ * "level" counts up from zero at the leaf level; "rightmost" selects the
+ * last page of that level instead of the first.  If the index is empty we
+ * return InvalidBuffer; any other failure condition causes elog().
+ * The returned buffer is pinned and read-locked.
+ */
+Buffer
+_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
+{
+	Buffer		buf;
+	Page		page;
+	BTPageOpaque opaque;
+	OffsetNumber offnum;
+	BlockNumber blkno;
+	BTItem		btitem;
+	IndexTuple	itup;
+
+	/*
+	 * If we are looking for a leaf page, okay to descend from fast root;
+	 * otherwise better descend from true root.  (There is no point in being
+	 * smarter about intermediate levels.)
+	 */
+	if (level == 0)
+		buf = _bt_getroot(rel, BT_READ);
+	else
+		buf = _bt_gettrueroot(rel);
+
+	if (!BufferIsValid(buf))
+	{
+		/* empty index... */
+		return InvalidBuffer;
+	}
+
+	page = BufferGetPage(buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	for (;;)
+	{
+		/*
+		 * If we landed on a deleted page, step right to find a live page
+		 * (there must be one).  Also, if we want the rightmost page,
+		 * step right if needed to get to it (this could happen if the
+		 * page split since we obtained a pointer to it).
+		 */
+		while (P_ISDELETED(opaque) ||
+			   (rightmost && !P_RIGHTMOST(opaque)))
+		{
+			blkno = opaque->btpo_next;
+			if (blkno == P_NONE)
+				elog(ERROR, "_bt_get_endpoint: ran off end of btree");
+			_bt_relbuf(rel, buf);
+			buf = _bt_getbuf(rel, blkno, BT_READ);
+			page = BufferGetPage(buf);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		}
+
+		/* Done?  (Finding ourselves below the target level is an error.) */
+		if (opaque->btpo.level == level)
+			break;
+		if (opaque->btpo.level < level)
+			elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);
+
+		/* Step to leftmost or rightmost child page */
+		if (rightmost)
+			offnum = PageGetMaxOffsetNumber(page);
+		else
+			offnum = P_FIRSTDATAKEY(opaque);
+
+		btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+		itup = &(btitem->bti_itup);
+		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));	/* child page */
+
+		_bt_relbuf(rel, buf);
+		buf = _bt_getbuf(rel, blkno, BT_READ);
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	}
+
+	return buf;
+}
+
 /*
  *	_bt_endpoint() -- Find the first or last key in the index.
  *
@@ -910,8 +993,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	Page		page;
 	BTPageOpaque opaque;
 	ItemPointer current;
-	OffsetNumber offnum,
-				maxoff;
+	OffsetNumber maxoff;
 	OffsetNumber start;
 	BlockNumber blkno;
 	BTItem		btitem;
@@ -929,7 +1011,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	 * simplified version of _bt_search().	We don't maintain a stack
 	 * since we know we won't need it.
 	 */
-	buf = _bt_getroot(rel, BT_READ);
+	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
 
 	if (!BufferIsValid(buf))
 	{
@@ -942,51 +1024,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	blkno = BufferGetBlockNumber(buf);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	Assert(P_ISLEAF(opaque));
 
-	for (;;)
-	{
-		if (P_ISLEAF(opaque))
-			break;
-
-		if (ScanDirectionIsForward(dir))
-			offnum = P_FIRSTDATAKEY(opaque);
-		else
-			offnum = PageGetMaxOffsetNumber(page);
-
-		btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-		itup = &(btitem->bti_itup);
-		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
-
-		_bt_relbuf(rel, buf);
-		buf = _bt_getbuf(rel, blkno, BT_READ);
-
-		page = BufferGetPage(buf);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-		/*
-		 * Race condition: If the child page we just stepped onto was just
-		 * split, we need to make sure we're all the way at the right edge
-		 * of the tree.  See the paper by Lehman and Yao.
-		 */
-		if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
-		{
-			do
-			{
-				blkno = opaque->btpo_next;
-				_bt_relbuf(rel, buf);
-				buf = _bt_getbuf(rel, blkno, BT_READ);
-				page = BufferGetPage(buf);
-				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-			} while (!P_RIGHTMOST(opaque));
-		}
-	}
-
-	/* okay, we've got the {left,right}-most page in the tree */
 	maxoff = PageGetMaxOffsetNumber(page);
 
 	if (ScanDirectionIsForward(dir))
 	{
-		Assert(P_LEFTMOST(opaque));
+		/* There could be dead pages to the left, so not this: */
+		/* Assert(P_LEFTMOST(opaque)); */
 
 		start = P_FIRSTDATAKEY(opaque);
 	}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index b0c6db8720..f9d227ecd0 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -35,7 +35,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.70 2002/11/15 01:26:08 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,6 +43,7 @@
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "miscadmin.h"
 #include "utils/tuplesort.h"
 
 
@@ -76,7 +77,7 @@ typedef struct BTPageState
 	BTItem		btps_minkey;	/* copy of minimum key (first item) on
 								 * page */
 	OffsetNumber btps_lastoff;	/* last item offset loaded */
-	int			btps_level;		/* tree level (0 = leaf) */
+	uint32		btps_level;		/* tree level (0 = leaf) */
 	Size		btps_full;		/* "full" if less than this much free
 								 * space */
 	struct BTPageState *btps_next;		/* link to parent level, if any */
@@ -90,8 +91,9 @@ typedef struct BTPageState
 	 0)
 
 
-static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags);
-static BTPageState *_bt_pagestate(Relation index, int flags, int level);
+static void _bt_blnewpage(Relation index, Buffer *buf, Page *page,
+						  uint32 level);
+static BTPageState *_bt_pagestate(Relation index, uint32 level);
 static void _bt_slideleft(Relation index, Buffer buf, Page page);
 static void _bt_sortaddtup(Page page, Size itemsize,
 			   BTItem btitem, OffsetNumber itup_off);
@@ -179,7 +181,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
  * allocate a new, clean btree page, not linked to any siblings.
  */
 static void
-_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+_bt_blnewpage(Relation index, Buffer *buf, Page *page, uint32 level)
 {
 	BTPageOpaque opaque;
 
@@ -192,23 +194,67 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
 	/* Initialize BT opaque state */
 	opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
 	opaque->btpo_prev = opaque->btpo_next = P_NONE;
-	opaque->btpo_flags = flags;
+	opaque->btpo.level = level;
+	opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
 
 	/* Make the P_HIKEY line pointer appear allocated */
 	((PageHeader) *page)->pd_lower += sizeof(ItemIdData);
 }
 
+/*
+ * emit a completed btree page, and release the lock and pin on it.
+ * This is essentially _bt_wrtbuf plus a WAL record (skipped for temp rels).
+ */
+static void
+_bt_blwritepage(Relation index, Buffer buf)
+{
+	Page		pg = BufferGetPage(buf);
+
+	/* NO ELOG(ERROR) from here till newpage op is logged */
+	START_CRIT_SECTION();
+
+	/* XLOG stuff: record header in rdata[0], page contents in rdata[1] */
+	if (!index->rd_istemp)
+	{
+		xl_btree_newpage xlrec;
+		XLogRecPtr	recptr;
+		XLogRecData rdata[2];
+
+		xlrec.node = index->rd_node;
+		xlrec.blkno = BufferGetBlockNumber(buf);
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &xlrec;
+		rdata[0].len = SizeOfBtreeNewpage;
+		rdata[0].next = &(rdata[1]);
+
+		rdata[1].buffer = buf;
+		rdata[1].data = (char *) pg;
+		rdata[1].len = BLCKSZ;
+		rdata[1].next = NULL;
+
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWPAGE, rdata);
+
+		PageSetLSN(pg, recptr);	/* stamp the page with its WAL position */
+		PageSetSUI(pg, ThisStartUpID);
+	}
+
+	END_CRIT_SECTION();
+
+	_bt_wrtbuf(index, buf);
+}
+
 /*
  * allocate and initialize a new BTPageState.  the returned structure
  * is suitable for immediate use by _bt_buildadd.
  */
 static BTPageState *
-_bt_pagestate(Relation index, int flags, int level)
+_bt_pagestate(Relation index, uint32 level)
 {
 	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
 
 	/* create initial page */
-	_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
+	_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), level);
 
 	state->btps_minkey = (BTItem) NULL;
 	/* initialize lastoff so first item goes into P_FIRSTKEY */
@@ -365,9 +411,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
 		ItemId		hii;
 		BTItem		obti;
 
-		/* Create new page */
-		_bt_blnewpage(index, &nbuf, &npage,
-					  (state->btps_level > 0) ? 0 : BTP_LEAF);
+		/* Create new page on same level */
+		_bt_blnewpage(index, &nbuf, &npage, state->btps_level);
 
 		/*
 		 * We copy the last item on the page into the new page, and then
@@ -396,10 +441,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
 		 * btree level.
 		 */
 		if (state->btps_next == (BTPageState *) NULL)
-		{
-			state->btps_next =
-				_bt_pagestate(index, 0, state->btps_level + 1);
-		}
+			state->btps_next = _bt_pagestate(index, state->btps_level + 1);
+
 		Assert(state->btps_minkey != NULL);
 		ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
 					   BufferGetBlockNumber(obuf), P_HIKEY);
@@ -414,16 +457,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
 		state->btps_minkey = _bt_formitem(&(obti->bti_itup));
 
 		/*
-		 * Set the sibling links for both pages, and parent links too.
-		 *
-		 * It's not necessary to set the parent link at all, because it's
-		 * only used for handling concurrent root splits, but we may as
-		 * well do it as a debugging aid.  Note we set new page's link as
-		 * well as old's, because if the new page turns out to be the last
-		 * of the level, _bt_uppershutdown won't change it.  The links may
-		 * be out of date by the time the build finishes, but that's OK;
-		 * they need only point to a left-sibling of the true parent.  See
-		 * the README file for more info.
+		 * Set the sibling links for both pages.
 		 */
 		{
 			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
@@ -431,9 +465,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
 
 			oopaque->btpo_next = BufferGetBlockNumber(nbuf);
 			nopaque->btpo_prev = BufferGetBlockNumber(obuf);
-			nopaque->btpo_next = P_NONE;
-			oopaque->btpo_parent = nopaque->btpo_parent =
-				BufferGetBlockNumber(state->btps_next->btps_buf);
+			nopaque->btpo_next = P_NONE; /* redundant */
 		}
 
 		/*
@@ -441,7 +473,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
 		 * can give up our lock (if we had one; most likely BuildingBtree
 		 * is set, so we aren't locking).
 		 */
-		_bt_wrtbuf(index, obuf);
+		_bt_blwritepage(index, obuf);
 
 		/*
 		 * Reset last_off to point to new page
@@ -519,7 +551,7 @@ _bt_uppershutdown(Relation index, BTPageState *state)
 		 * slid back one slot.	Then we can dump out the page.
 		 */
 		_bt_slideleft(index, s->btps_buf, s->btps_page);
-		_bt_wrtbuf(index, s->btps_buf);
+		_bt_blwritepage(index, s->btps_buf);
 	}
 }
 
@@ -603,7 +635,7 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
 
 			/* When we see first tuple, create first index page */
 			if (state == NULL)
-				state = _bt_pagestate(index, BTP_LEAF, 0);
+				state = _bt_pagestate(index, 0);
 
 			if (load1)
 			{
@@ -623,13 +655,13 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
 		_bt_freeskey(indexScanKey);
 	}
 	else
-/* merge is unnecessary */
 	{
+		/* merge is unnecessary */
 		while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL)
 		{
 			/* When we see first tuple, create first index page */
 			if (state == NULL)
-				state = _bt_pagestate(index, BTP_LEAF, 0);
+				state = _bt_pagestate(index, 0);
 
 			_bt_buildadd(index, state, bti);
 			if (should_free)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
new file mode 100644
index 0000000000..87a0aaaa7a
--- /dev/null
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -0,0 +1,780 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtxlog.c
+ *	  WAL replay logic for btrees.
+ *
+ *
+ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.1 2003/02/21 00:06:21 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/xlogutils.h"
+
+
+/*
+ * We must keep track of expected insertions due to page splits, and apply
+ * them manually if they are not seen in the WAL log during replay.  This
+ * makes it safe for page insertion to be a multiple-WAL-action process.
+ *
+ * The data structure is a simple linked list --- this should be good enough,
+ * since we don't expect a page split to remain incomplete for long.
+ */
+typedef struct bt_incomplete_split
+{
+	RelFileNode node;			/* the index */
+	BlockNumber	leftblk;		/* left half of split */
+	BlockNumber	rightblk;		/* right half of split (match key) */
+	bool		is_root;		/* we split the root */
+} bt_incomplete_split;
+
+static List *incomplete_splits;	/* list of bt_incomplete_split pointers */
+
+/* Remember a split until WAL replay sees the matching parent insertion. */
+static void
+log_incomplete_split(RelFileNode node, BlockNumber leftblk,
+					 BlockNumber rightblk, bool is_root)
+{
+	bt_incomplete_split *entry = palloc(sizeof(*entry));
+
+	entry->is_root = is_root;
+	entry->rightblk = rightblk;
+	entry->leftblk = leftblk;
+	entry->node = node;
+	incomplete_splits = lappend(incomplete_splits, entry);
+}
+
+/* Drop the pending split whose right page the new downlink points to. */
+static void
+forget_matching_split(Relation reln, RelFileNode node, BlockNumber insertblk,
+					  OffsetNumber offnum, bool is_root)
+{
+	Buffer		buffer;
+	Page		page;
+	BTItem		btitem;
+	BlockNumber rightblk;
+	List	   *l;
+
+	buffer = XLogReadBuffer(false, reln, insertblk);	/* page inserted into */
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "forget_matching_split: block unfound");
+	page = (Page) BufferGetPage(buffer);
+	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+	rightblk = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
+	Assert(ItemPointerGetOffsetNumber(&(btitem->bti_itup.t_tid)) == P_HIKEY);
+	UnlockAndReleaseBuffer(buffer);
+
+	foreach(l, incomplete_splits)
+	{
+		bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
+
+		if (RelFileNodeEquals(node, split->node) &&
+			rightblk == split->rightblk)
+		{
+			if (is_root != split->is_root)
+				elog(LOG, "forget_matching_split: fishy is_root data");
+			incomplete_splits = lremove(split, incomplete_splits);
+			pfree(split);		/* palloc'd in log_incomplete_split */
+			break;				/* need not look further */
+		}
+	}
+}
+
+static void
+_bt_restore_page(Page page, char *from, int len)
+{
+	char	   *stop = from + len;	/* end of the item data */
+	BTItemData	hdr;				/* local copy of current item's header */
+	Size		sz;
+
+	while (from < stop)
+	{
+		memcpy(&hdr, from, sizeof(BTItemData));
+		sz = MAXALIGN(IndexTupleDSize(hdr.bti_itup) +
+					  (sizeof(BTItemData) - sizeof(IndexTupleData)));
+		/* every item is added with FirstOffsetNumber as its target slot */
+		if (PageAddItem(page, (Item) from, sz,
+					  FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
+			elog(PANIC, "_bt_restore_page: can't add item to page");
+		from += sz;
+	}
+}
+
+/* Rebuild the metapage from scratch using root/fastroot data from WAL. */
+static void
+_bt_restore_meta(Relation reln, XLogRecPtr lsn, BlockNumber root, uint32 level,
+				 BlockNumber fastroot, uint32 fastlevel)
+{
+	BTPageOpaque opaque;
+	BTMetaPageData *meta;
+	Page		pg;
+	Buffer		buf;
+
+	buf = XLogReadBuffer(true, reln, BTREE_METAPAGE);
+	if (!BufferIsValid(buf))
+		elog(PANIC, "_bt_restore_meta: no metapage");
+
+	/* reinitialize the page, then flag it as the btree metapage */
+	pg = BufferGetPage(buf);
+	_bt_pageinit(pg, BufferGetPageSize(buf));
+	opaque = (BTPageOpaque) PageGetSpecialPointer(pg);
+	opaque->btpo_flags = BTP_META;
+
+	meta = BTPageGetMeta(pg);
+	meta->btm_magic = BTREE_MAGIC;
+	meta->btm_version = BTREE_VERSION;
+	meta->btm_root = root;
+	meta->btm_level = level;
+	meta->btm_fastroot = fastroot;
+	meta->btm_fastlevel = fastlevel;
+
+	PageSetLSN(pg, lsn);
+	PageSetSUI(pg, ThisStartUpID);
+	UnlockAndWriteBuffer(buf);
+}
+/* Replay a btree insert record; undo of leaf inserts is unimplemented. */
+static void
+btree_xlog_insert(bool redo, bool isleaf, bool ismeta,
+				  XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
+	Relation	reln;
+	Buffer		buffer;
+	Page		page;
+	BTPageOpaque pageop;
+	char	   *datapos;
+	int			datalen;
+	xl_btree_metadata md;
+
+	datapos = (char *) xlrec + SizeOfBtreeInsert;
+	datalen = record->xl_len - SizeOfBtreeInsert;
+	if (ismeta)					/* metadata precedes the item data */
+	{
+		memcpy(&md, datapos, sizeof(xl_btree_metadata));
+		datapos += sizeof(xl_btree_metadata);
+		datalen -= sizeof(xl_btree_metadata);
+	}
+
+	if (redo && (record->xl_info & XLR_BKP_BLOCK_1) && !ismeta &&
+		incomplete_splits == NIL)
+		return;					/* nothing to do */
+
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+	if (!RelationIsValid(reln))
+		return;
+
+	if (!redo || !(record->xl_info & XLR_BKP_BLOCK_1))
+	{
+		buffer = XLogReadBuffer(false, reln,
+								ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+		if (!BufferIsValid(buffer))
+			elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
+		page = (Page) BufferGetPage(buffer);
+		if (PageIsNew((PageHeader) page))
+			elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
+		pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+		if (redo)
+		{
+			if (XLByteLE(lsn, PageGetLSN(page)))
+			{
+				UnlockAndReleaseBuffer(buffer);	/* page LSN not older: skip */
+			}
+			else
+			{
+				if (PageAddItem(page, (Item) datapos, datalen,
+								ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+								LP_USED) == InvalidOffsetNumber)
+					elog(PANIC, "btree_insert_redo: failed to add item");
+
+				PageSetLSN(page, lsn);
+				PageSetSUI(page, ThisStartUpID);
+				UnlockAndWriteBuffer(buffer);
+			}
+		}
+		else
+		{
+			if (XLByteLT(PageGetLSN(page), lsn))
+				elog(PANIC, "btree_insert_undo: bad page LSN");
+
+			if (!P_ISLEAF(pageop))
+			{
+				UnlockAndReleaseBuffer(buffer);	/* non-leaf: nothing undone */
+			}
+			else
+			{
+				elog(PANIC, "btree_insert_undo: unimplemented");
+			}
+		}
+	}
+
+	if (redo)					/* metapage changes not undoable */
+	{
+		if (ismeta)
+			_bt_restore_meta(reln, lsn,
+							 md.root, md.level,
+							 md.fastroot, md.fastlevel);
+	}
+
+	/* Forget any split this insertion completes */
+	if (redo && !isleaf && incomplete_splits != NIL)
+	{
+		forget_matching_split(reln, xlrec->target.node,
+							  ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+							  ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+							  false);	/* is_root */
+	}
+}
+
+static void
+btree_xlog_split(bool redo, bool onleft, bool isroot,
+				 XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+	Relation	reln;
+	BlockNumber	targetblk;
+	BlockNumber	leftsib;
+	BlockNumber	rightsib;
+	Buffer		buffer;
+	Page		page;
+	BTPageOpaque pageop;
+	char	   *op = (redo) ? "redo" : "undo";
+
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+	if (!RelationIsValid(reln))
+		return;
+
+	targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid));
+	leftsib = (onleft) ? targetblk : xlrec->otherblk;
+	rightsib = (onleft) ? xlrec->otherblk : targetblk;
+
+	/* Left (original) sibling */
+	buffer = XLogReadBuffer(false, reln, leftsib);
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "btree_split_%s: lost left sibling", op);
+
+	page = (Page) BufferGetPage(buffer);
+	if (redo)
+		_bt_pageinit(page, BufferGetPageSize(buffer));
+	else if (PageIsNew((PageHeader) page))
+		elog(PANIC, "btree_split_undo: uninitialized left sibling");
+	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (redo)
+	{
+		pageop->btpo_prev = xlrec->leftblk;
+		pageop->btpo_next = rightsib;
+		pageop->btpo.level = xlrec->level;
+		pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+
+		_bt_restore_page(page,
+						 (char *) xlrec + SizeOfBtreeSplit,
+						 xlrec->leftlen);
+
+		PageSetLSN(page, lsn);
+		PageSetSUI(page, ThisStartUpID);
+		UnlockAndWriteBuffer(buffer);
+	}
+	else
+	{
+		/* undo */
+		if (XLByteLT(PageGetLSN(page), lsn))
+			elog(PANIC, "btree_split_undo: bad left sibling LSN");
+		elog(PANIC, "btree_split_undo: unimplemented");
+	}
+
+	/* Right (new) sibling */
+	buffer = XLogReadBuffer((redo) ? true : false, reln, rightsib);
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "btree_split_%s: lost right sibling", op);
+
+	page = (Page) BufferGetPage(buffer);
+	if (redo)
+		_bt_pageinit(page, BufferGetPageSize(buffer));
+	else if (PageIsNew((PageHeader) page))
+		elog(PANIC, "btree_split_undo: uninitialized right sibling");
+	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (redo)
+	{
+		pageop->btpo_prev = leftsib;
+		pageop->btpo_next = xlrec->rightblk;
+		pageop->btpo.level = xlrec->level;
+		pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+
+		_bt_restore_page(page,
+					  (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
+					 record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
+
+		PageSetLSN(page, lsn);
+		PageSetSUI(page, ThisStartUpID);
+		UnlockAndWriteBuffer(buffer);
+	}
+	else
+	{
+		/* undo */
+		if (XLByteLT(PageGetLSN(page), lsn))
+			elog(PANIC, "btree_split_undo: bad right sibling LSN");
+		elog(PANIC, "btree_split_undo: unimplemented");
+	}
+
+	/* Fix left-link of right (next) page */
+	if (redo && !(record->xl_info & XLR_BKP_BLOCK_1))
+	{
+		if (xlrec->rightblk != P_NONE)
+		{
+			buffer = XLogReadBuffer(false, reln, xlrec->rightblk);
+			if (!BufferIsValid(buffer))
+				elog(PANIC, "btree_split_redo: lost next right page");
+
+			page = (Page) BufferGetPage(buffer);
+			if (PageIsNew((PageHeader) page))
+				elog(PANIC, "btree_split_redo: uninitialized next right page");
+
+			if (XLByteLE(lsn, PageGetLSN(page)))
+			{
+				UnlockAndReleaseBuffer(buffer);
+			}
+			else
+			{
+				pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+				pageop->btpo_prev = rightsib;
+
+				PageSetLSN(page, lsn);
+				PageSetSUI(page, ThisStartUpID);
+				UnlockAndWriteBuffer(buffer);
+			}
+		}
+	}
+
+	/* Forget any split this insertion completes */
+	if (redo && xlrec->level > 0 && incomplete_splits != NIL)
+	{
+		forget_matching_split(reln, xlrec->target.node,
+							  ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+							  ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
+							  false);
+	}
+
+	/* The job ain't done till the parent link is inserted... */
+	log_incomplete_split(xlrec->target.node,
+						 leftsib, rightsib, isroot);
+}
+
+static void
+btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_delete *xlrec;
+	Relation	reln;
+	Buffer		buffer;
+	Page		page;
+
+	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
+		return;
+
+	xlrec = (xl_btree_delete *) XLogRecGetData(record);
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
+	if (!RelationIsValid(reln))
+		return;
+	buffer = XLogReadBuffer(false, reln,
+						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "btree_delete_redo: block unfound");
+	page = (Page) BufferGetPage(buffer);
+	if (PageIsNew((PageHeader) page))
+		elog(PANIC, "btree_delete_redo: uninitialized page");
+
+	if (XLByteLE(lsn, PageGetLSN(page)))
+	{
+		UnlockAndReleaseBuffer(buffer);
+		return;
+	}
+
+	PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
+
+	PageSetLSN(page, lsn);
+	PageSetSUI(page, ThisStartUpID);
+	UnlockAndWriteBuffer(buffer);
+}
+
+static void
+btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
+	Relation	reln;
+	Buffer		buffer;
+	Page		page;
+	BTPageOpaque pageop;
+
+	if (!redo)
+		return;					/* not undoable */
+
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+	if (!RelationIsValid(reln))
+		return;
+	buffer = XLogReadBuffer(true, reln, xlrec->rootblk);
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "btree_newroot_redo: no root page");
+
+	page = (Page) BufferGetPage(buffer);
+	_bt_pageinit(page, BufferGetPageSize(buffer));
+	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	pageop->btpo_flags = BTP_ROOT;
+	pageop->btpo_prev = pageop->btpo_next = P_NONE;
+	pageop->btpo.level = xlrec->level;
+	if (xlrec->level == 0)
+		pageop->btpo_flags |= BTP_LEAF;
+
+	if (record->xl_len > SizeOfBtreeNewroot)
+		_bt_restore_page(page,
+						 (char *) xlrec + SizeOfBtreeNewroot,
+						 record->xl_len - SizeOfBtreeNewroot);
+
+	PageSetLSN(page, lsn);
+	PageSetSUI(page, ThisStartUpID);
+	UnlockAndWriteBuffer(buffer);
+
+	_bt_restore_meta(reln, lsn,
+					 xlrec->rootblk, xlrec->level,
+					 xlrec->rootblk, xlrec->level);
+
+	/* Check to see if this satisfies any incomplete insertions */
+	if (record->xl_len > SizeOfBtreeNewroot &&
+		incomplete_splits != NIL)
+	{
+		forget_matching_split(reln, xlrec->node,
+							  xlrec->rootblk,
+							  P_FIRSTKEY,
+							  true);
+	}
+}
+
+static void
+btree_xlog_newmeta(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_newmeta *xlrec = (xl_btree_newmeta *) XLogRecGetData(record);
+	Relation	reln;
+
+	if (!redo)
+		return;					/* not undoable */
+
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+	if (!RelationIsValid(reln))
+		return;
+
+	_bt_restore_meta(reln, lsn,
+					 xlrec->meta.root, xlrec->meta.level,
+					 xlrec->meta.fastroot, xlrec->meta.fastlevel);
+}
+
+static void
+btree_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
+{
+	xl_btree_newpage *xlrec = (xl_btree_newpage *) XLogRecGetData(record);
+	Relation	reln;
+	Buffer		buffer;
+	Page		page;
+
+	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
+		return;
+
+	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
+	if (!RelationIsValid(reln))
+		return;
+	buffer = XLogReadBuffer(true, reln, xlrec->blkno);
+	if (!BufferIsValid(buffer))
+		elog(PANIC, "btree_newpage_redo: block unfound");
+	page = (Page) BufferGetPage(buffer);
+
+	Assert(record->xl_len == SizeOfBtreeNewpage + BLCKSZ);
+	memcpy(page, (char *) xlrec + SizeOfBtreeNewpage, BLCKSZ);
+
+	PageSetLSN(page, lsn);
+	PageSetSUI(page, ThisStartUpID);
+	UnlockAndWriteBuffer(buffer);
+}
+
+
+void
+btree_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	switch (info)
+	{
+		case XLOG_BTREE_INSERT_LEAF:
+			btree_xlog_insert(true, true, false, lsn, record);
+			break;
+		case XLOG_BTREE_INSERT_UPPER:
+			btree_xlog_insert(true, false, false, lsn, record);
+			break;
+		case XLOG_BTREE_INSERT_META:
+			btree_xlog_insert(true, false, true, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_L:
+			btree_xlog_split(true, true, false, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_R:
+			btree_xlog_split(true, false, false, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_L_ROOT:
+			btree_xlog_split(true, true, true, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_R_ROOT:
+			btree_xlog_split(true, false, true, lsn, record);
+			break;
+		case XLOG_BTREE_DELETE:
+			btree_xlog_delete(true, lsn, record);
+			break;
+		case XLOG_BTREE_DELETE_PAGE:
+		case XLOG_BTREE_DELETE_PAGE_META:
+			/* XXX no redo action is implemented for page deletion */
+			break;
+		case XLOG_BTREE_NEWROOT:
+			btree_xlog_newroot(true, lsn, record);
+			break;
+		case XLOG_BTREE_NEWMETA:
+			btree_xlog_newmeta(true, lsn, record);
+			break;
+		case XLOG_BTREE_NEWPAGE:
+			btree_xlog_newpage(true, lsn, record);
+			break;
+		default:
+			elog(PANIC, "btree_redo: unknown op code %u", info);
+	}
+}
+
+void
+btree_undo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	switch (info)
+	{
+		case XLOG_BTREE_INSERT_LEAF:
+			btree_xlog_insert(false, true, false, lsn, record);
+			break;
+		case XLOG_BTREE_INSERT_UPPER:
+			btree_xlog_insert(false, false, false, lsn, record);
+			break;
+		case XLOG_BTREE_INSERT_META:
+			btree_xlog_insert(false, false, true, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_L:
+			btree_xlog_split(false, true, false, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_R:
+			btree_xlog_split(false, false, false, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_L_ROOT:
+			btree_xlog_split(false, true, true, lsn, record);
+			break;
+		case XLOG_BTREE_SPLIT_R_ROOT:
+			btree_xlog_split(false, false, true, lsn, record);
+			break;
+		case XLOG_BTREE_DELETE:
+			btree_xlog_delete(false, lsn, record);
+			break;
+		case XLOG_BTREE_DELETE_PAGE:
+		case XLOG_BTREE_DELETE_PAGE_META:
+			/* XXX no undo action is implemented for page deletion */
+			break;
+		case XLOG_BTREE_NEWROOT:
+			btree_xlog_newroot(false, lsn, record);
+			break;
+		case XLOG_BTREE_NEWMETA:
+			btree_xlog_newmeta(false, lsn, record);
+			break;
+		case XLOG_BTREE_NEWPAGE:
+			btree_xlog_newpage(false, lsn, record);
+			break;
+		default:
+			elog(PANIC, "btree_undo: unknown op code %u", info);
+	}
+}
+
+static void
+out_target(char *buf, xl_btreetid *target)
+{
+	sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
+			target->node.tblNode, target->node.relNode,
+			ItemPointerGetBlockNumber(&(target->tid)),
+			ItemPointerGetOffsetNumber(&(target->tid)));
+}
+
+void
+btree_desc(char *buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;
+
+	switch (info)
+	{
+		case XLOG_BTREE_INSERT_LEAF:
+		{
+			xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+			strcat(buf, "insert: ");
+			out_target(buf, &(xlrec->target));
+			break;
+		}
+		case XLOG_BTREE_INSERT_UPPER:
+		{
+			xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+			strcat(buf, "insert_upper: ");
+			out_target(buf, &(xlrec->target));
+			break;
+		}
+		case XLOG_BTREE_INSERT_META:
+		{
+			xl_btree_insert *xlrec = (xl_btree_insert *) rec;
+
+			strcat(buf, "insert_meta: ");
+			out_target(buf, &(xlrec->target));
+			break;
+		}
+		case XLOG_BTREE_SPLIT_L:
+		{
+			xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+			strcat(buf, "split_l: ");
+			out_target(buf, &(xlrec->target));
+			sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+					xlrec->otherblk, xlrec->rightblk);
+			break;
+		}
+		case XLOG_BTREE_SPLIT_R:
+		{
+			xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+			strcat(buf, "split_r: ");
+			out_target(buf, &(xlrec->target));
+			sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+					xlrec->otherblk, xlrec->rightblk);
+			break;
+		}
+		case XLOG_BTREE_SPLIT_L_ROOT:
+		{
+			xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+			strcat(buf, "split_l_root: ");
+			out_target(buf, &(xlrec->target));
+			sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+					xlrec->otherblk, xlrec->rightblk);
+			break;
+		}
+		case XLOG_BTREE_SPLIT_R_ROOT:
+		{
+			xl_btree_split *xlrec = (xl_btree_split *) rec;
+
+			strcat(buf, "split_r_root: ");
+			out_target(buf, &(xlrec->target));
+			sprintf(buf + strlen(buf), "; oth %u; rgh %u",
+					xlrec->otherblk, xlrec->rightblk);
+			break;
+		}
+		case XLOG_BTREE_DELETE:
+		{
+			xl_btree_delete *xlrec = (xl_btree_delete *) rec;
+
+			strcat(buf, "delete: ");
+			out_target(buf, &(xlrec->target));
+			break;
+		}
+		case XLOG_BTREE_DELETE_PAGE:
+		case XLOG_BTREE_DELETE_PAGE_META:
+		{
+			xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;
+
+			strcat(buf, "delete_page: ");
+			out_target(buf, &(xlrec->target));
+			sprintf(buf + strlen(buf), "; dead %u; left %u; right %u",
+					xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
+			break;
+		}
+		case XLOG_BTREE_NEWROOT:
+		{
+			xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
+
+			sprintf(buf + strlen(buf), "newroot: node %u/%u; root %u lev %u",
+					xlrec->node.tblNode, xlrec->node.relNode,
+					xlrec->rootblk, xlrec->level);
+			break;
+		}
+		case XLOG_BTREE_NEWMETA:
+		{
+			xl_btree_newmeta *xlrec = (xl_btree_newmeta *) rec;
+
+			sprintf(buf + strlen(buf), "newmeta: node %u/%u; root %u lev %u fast %u lev %u",
+					xlrec->node.tblNode, xlrec->node.relNode,
+					xlrec->meta.root, xlrec->meta.level,
+					xlrec->meta.fastroot, xlrec->meta.fastlevel);
+			break;
+		}
+		case XLOG_BTREE_NEWPAGE:
+		{
+			xl_btree_newpage *xlrec = (xl_btree_newpage *) rec;
+
+			sprintf(buf + strlen(buf), "newpage: node %u/%u; page %u",
+					xlrec->node.tblNode, xlrec->node.relNode,
+					xlrec->blkno);
+			break;
+		}
+		default:
+			strcat(buf, "UNKNOWN");
+			break;
+	}
+}
+
+void
+btree_xlog_startup(void)
+{
+	incomplete_splits = NIL;
+}
+
+void
+btree_xlog_cleanup(void)
+{
+	List	   *l;
+
+	foreach(l, incomplete_splits)
+	{
+		bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
+		Relation	reln;
+		Buffer		lbuf,
+					rbuf;
+		Page		lpage,
+					rpage;
+		BTPageOpaque lpageop,
+					rpageop;
+		bool		is_only;
+
+		reln = XLogOpenRelation(true, RM_BTREE_ID, split->node);
+		if (!RelationIsValid(reln))
+			continue;
+		lbuf = XLogReadBuffer(false, reln, split->leftblk);
+		if (!BufferIsValid(lbuf))
+			elog(PANIC, "btree_xlog_cleanup: left block unfound");
+		lpage = (Page) BufferGetPage(lbuf);
+		lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+		rbuf = XLogReadBuffer(false, reln, split->rightblk);
+		if (!BufferIsValid(rbuf))
+			elog(PANIC, "btree_xlog_cleanup: right block unfound");
+		rpage = (Page) BufferGetPage(rbuf);
+		rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+		/* if the two pages are all of their level, it's an only-page split */
+		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
+
+		_bt_insert_parent(reln, lbuf, rbuf, (BTStack) NULL,
+						  split->is_root, is_only);
+	}
+	incomplete_splits = NIL;
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index b9af3a06c9..59af280802 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.9 2001/08/25 18:52:41 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.10 2003/02/21 00:06:22 tgl Exp $
  */
 #include "postgres.h"
 
@@ -19,21 +19,22 @@
 #include "commands/sequence.h"
 
 
-RmgrData	RmgrTable[] = {
-	{"XLOG", xlog_redo, xlog_undo, xlog_desc},
-	{"Transaction", xact_redo, xact_undo, xact_desc},
-	{"Storage", smgr_redo, smgr_undo, smgr_desc},
-	{"CLOG", clog_redo, clog_undo, clog_desc},
-	{"Reserved 4", NULL, NULL, NULL},
-	{"Reserved 5", NULL, NULL, NULL},
-	{"Reserved 6", NULL, NULL, NULL},
-	{"Reserved 7", NULL, NULL, NULL},
-	{"Reserved 8", NULL, NULL, NULL},
-	{"Reserved 9", NULL, NULL, NULL},
-	{"Heap", heap_redo, heap_undo, heap_desc},
-	{"Btree", btree_redo, btree_undo, btree_desc},
-	{"Hash", hash_redo, hash_undo, hash_desc},
-	{"Rtree", rtree_redo, rtree_undo, rtree_desc},
-	{"Gist", gist_redo, gist_undo, gist_desc},
-	{"Sequence", seq_redo, seq_undo, seq_desc}
+RmgrData	RmgrTable[RM_MAX_ID+1] = {
+	{"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
+	{"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
+	{"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
+	{"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
+	{"Reserved 4", NULL, NULL, NULL, NULL, NULL},
+	{"Reserved 5", NULL, NULL, NULL, NULL, NULL},
+	{"Reserved 6", NULL, NULL, NULL, NULL, NULL},
+	{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
+	{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
+	{"Reserved 9", NULL, NULL, NULL, NULL, NULL},
+	{"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL},
+	{"Btree", btree_redo, btree_undo, btree_desc,
+	 btree_xlog_startup, btree_xlog_cleanup},
+	{"Hash", hash_redo, hash_undo, hash_desc, NULL, NULL},
+	{"Rtree", rtree_redo, rtree_undo, rtree_desc, NULL, NULL},
+	{"Gist", gist_redo, gist_undo, gist_desc, NULL, NULL},
+	{"Sequence", seq_redo, seq_undo, seq_desc, NULL, NULL}
 };
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c35762bba9..3b615f8229 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.111 2003/01/25 03:06:04 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.112 2003/02/21 00:06:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1203,16 +1203,6 @@ XLogFlush(XLogRecPtr record)
 	XLogRecPtr	WriteRqstPtr;
 	XLogwrtRqst WriteRqst;
 
-	if (XLOG_DEBUG)
-	{
-		elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X",
-			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
-			 (InRedo) ? "(redo)" : "",
-			 record.xlogid, record.xrecoff,
-			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
-			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
-	}
-
 	/* Disabled during REDO */
 	if (InRedo)
 		return;
@@ -1221,6 +1211,15 @@ XLogFlush(XLogRecPtr record)
 	if (XLByteLE(record, LogwrtResult.Flush))
 		return;
 
+	if (XLOG_DEBUG)
+	{
+		elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X",
+			 (IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
+			 record.xlogid, record.xrecoff,
+			 LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
+			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
+	}
+
 	START_CRIT_SECTION();
 
 	/*
@@ -2515,6 +2514,12 @@ StartupXLOG(void)
 		elog(LOG, "database system was interrupted at %s",
 			 str_time(ControlFile->time));
 
+	/* This is just to allow attaching to startup process with a debugger */
+#ifdef XLOG_REPLAY_DELAY
+	if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED)
+		sleep(60);
+#endif
+
 	/*
 	 * Get the last valid checkpoint record.  If the latest one according
 	 * to pg_control is broken, try the next-to-last one.
@@ -2578,14 +2583,23 @@ StartupXLOG(void)
 	/* REDO */
 	if (InRecovery)
 	{
+		int		rmid;
+
 		elog(LOG, "database system was not properly shut down; "
 			 "automatic recovery in progress");
 		ControlFile->state = DB_IN_RECOVERY;
 		ControlFile->time = time(NULL);
 		UpdateControlFile();
 
+		/* Start up the recovery environment */
 		XLogInitRelationCache();
 
+		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+		{
+			if (RmgrTable[rmid].rm_startup != NULL)
+				RmgrTable[rmid].rm_startup();
+		}
+
 		/* Is REDO required ? */
 		if (XLByteLT(checkPoint.redo, RecPtr))
 			record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
@@ -2737,7 +2751,25 @@ StartupXLOG(void)
 
 	if (InRecovery)
 	{
+		int		rmid;
+
 		/*
+		 * Allow resource managers to do any required cleanup.
+		 */
+		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+		{
+			if (RmgrTable[rmid].rm_cleanup != NULL)
+				RmgrTable[rmid].rm_cleanup();
+		}
+
+		/* suppress in-transaction check in CreateCheckPoint */
+		MyLastRecPtr.xrecoff = 0;
+		MyXactMadeXLogEntry = false;
+		MyXactMadeTempRelUpdate = false;
+
+		/*
+		 * Perform a new checkpoint to update our recovery activity to disk.
+		 *
 		 * In case we had to use the secondary checkpoint, make sure that
 		 * it will still be shown as the secondary checkpoint after this
 		 * CreateCheckPoint operation; we don't want the broken primary
@@ -2745,6 +2777,10 @@ StartupXLOG(void)
 		 */
 		ControlFile->checkPoint = checkPointLoc;
 		CreateCheckPoint(true, true);
+
+		/*
+		 * Close down recovery environment
+		 */
 		XLogCloseRelationCache();
 	}
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7d279ef94d..f4dce1842f 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $
+ * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,46 +22,55 @@
 /*
  *	BTPageOpaqueData -- At the end of every page, we store a pointer
  *	to both siblings in the tree.  This is used to do forward/backward
- *	index scans.  See Lehman and Yao's paper for more
- *	info.  In addition, we need to know what type of page this is
- *	(leaf or internal), and whether the page is available for reuse.
+ *	index scans.  The next-page link is also critical for recovery when
+ *	a search has navigated to the wrong page due to concurrent page splits
+ *	or deletions; see src/backend/access/nbtree/README for more info.
  *
- *	We also store a back-link to the parent page, but this cannot be trusted
- *	very far since it does not get updated when the parent is split.
- *	See backend/access/nbtree/README for details.
+ *  In addition, we store the page's btree level (counting upwards from
+ *	zero at a leaf page) as well as some flag bits indicating the page type
+ *	and status.  If the page is deleted, we replace the level with the
+ *	next-transaction-ID value indicating when it is safe to reclaim the page.
+ *
+ *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
+ *	instead.
  */
 
 typedef struct BTPageOpaqueData
 {
-	BlockNumber btpo_prev;		/* used for backward index scans */
-	BlockNumber btpo_next;		/* used for forward index scans */
-	BlockNumber btpo_parent;	/* pointer to parent, but not updated on
-								 * parent split */
-	uint16		btpo_flags;		/* LEAF?, ROOT?, FREE?, META?, REORDER? */
-
+	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
+	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
+	union
+	{
+		uint32	level;			/* tree level --- zero for leaf pages */
+		TransactionId xact;		/* next transaction ID, if deleted */
+	} btpo;
+	uint16		btpo_flags;		/* flag bits, see below */
 } BTPageOpaqueData;
 
 typedef BTPageOpaqueData *BTPageOpaque;
 
 /* Bits defined in btpo_flags */
-#define BTP_LEAF		(1 << 0)	/* leaf page, if not internal page */
+#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
 #define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
-#define BTP_FREE		(1 << 2)	/* page not in use */
+#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
 #define BTP_META		(1 << 3)	/* meta-page */
-#define BTP_REORDER		(1 << 4)	/* items need reordering */
 
 
 /*
  * The Meta page is always the first page in the btree index.
  * Its primary purpose is to point to the location of the btree root page.
+ * We also point to the "fast" root, which is the current effective root;
+ * see README for discussion.
  */
 
 typedef struct BTMetaPageData
 {
-	uint32		btm_magic;
-	uint32		btm_version;
-	BlockNumber btm_root;
-	int32		btm_level;
+	uint32		btm_magic;		/* should contain BTREE_MAGIC */
+	uint32		btm_version;	/* should contain BTREE_VERSION */
+	BlockNumber btm_root;		/* current root location */
+	uint32		btm_level;		/* tree level of the root page */
+	BlockNumber btm_fastroot;	/* current "fast" root location */
+	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
 } BTMetaPageData;
 
 #define BTPageGetMeta(p) \
@@ -69,12 +78,7 @@ typedef struct BTMetaPageData
 
 #define BTREE_METAPAGE	0		/* first page is meta */
 #define BTREE_MAGIC		0x053162	/* magic number of btree pages */
-
-#define BTreeInvalidParent(opaque)	\
-	(opaque->btpo_parent == InvalidBlockNumber || \
-		opaque->btpo_parent == BTREE_METAPAGE)
-
-#define BTREE_VERSION	1
+#define BTREE_VERSION	2		/* current version number */
 
 /*
  * We actually need to be able to fit three items on every page,
@@ -84,6 +88,295 @@ typedef struct BTMetaPageData
 	((PageGetPageSize(page) - \
 	  sizeof(PageHeaderData) - \
 	  MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
+
+/*
+ *	BTItems are what we store in the btree.  Each item is an index tuple,
+ *	including key and pointer values.  (In some cases either the key or the
+ *	pointer may go unused, see backend/access/nbtree/README for details.)
+ *
+ *	Old comments:
+ *	In addition, we must guarantee that all tuples in the index are unique,
+ *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
+ *	do this is by generating a new OID for every insertion that we do in the
+ *	tree.  This adds eight bytes to the size of btree index tuples.  Note
+ *	that we do not use the OID as part of a composite key; the OID only
+ *	serves as a unique identifier for a given index tuple (logical position
+ *	within a page).
+ *
+ *	New comments:
+ *	actually, we must guarantee that all tuples in A LEVEL
+ *	are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
+ *	as unique identifier for a given index tuple (logical position
+ *	within a level). - vadim 04/09/97
+ */
+
+typedef struct BTItemData
+{
+	IndexTupleData bti_itup;
+} BTItemData;
+
+typedef BTItemData *BTItem;
+
+/*
+ * For XLOG: size without alignment. Sizeof works as long as
+ * IndexTupleData has exactly 8 bytes.
+ */
+#define SizeOfBTItem	sizeof(BTItemData)
+
+/* Test whether items are the "same" per the above notes */
+#define BTItemSame(i1, i2)	  ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
+								(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
+								(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
+								(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
+								(i1)->bti_itup.t_tid.ip_posid == \
+								(i2)->bti_itup.t_tid.ip_posid )
+
+/*
+ *	In general, the btree code tries to localize its knowledge about
+ *	page layout to a couple of routines.  However, we need a special
+ *	value to indicate "no page number" in those places where we expect
+ *	page numbers.  We can use zero for this because we never need to
+ *	make a pointer to the metadata page.
+ */
+
+#define P_NONE			0
+
+/*
+ * Macros to test whether a page is leftmost or rightmost on its tree level,
+ * as well as other state info kept in the opaque data.
+ */
+#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
+#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
+#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
+#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
+#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
+
+/*
+ *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
+ *	page.  The high key is not a data key, but gives info about what range of
+ *	keys is supposed to be on this page.  The high key on a page is required
+ *	to be greater than or equal to any data key that appears on the page.
+ *	If we find ourselves trying to insert a key > high key, we know we need
+ *	to move right (this should only happen if the page was split since we
+ *	examined the parent page).
+ *
+ *	Our insertion algorithm guarantees that we can use the initial least key
+ *	on our right sibling as the high key.  Once a page is created, its high
+ *	key changes only if the page is split.
+ *
+ *	On a non-rightmost page, the high key lives in item 1 and data items
+ *	start in item 2.  Rightmost pages have no high key, so we store data
+ *	items beginning in item 1.
+ */
+
+#define P_HIKEY				((OffsetNumber) 1)
+#define P_FIRSTKEY			((OffsetNumber) 2)
+#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
+
+/*
+ * XLOG records for btree operations
+ *
+ * XLOG allows us to store some information in the high 4 bits of a log
+ * record's xl_info field
+ */
+#define XLOG_BTREE_INSERT_LEAF	0x00	/* add btitem without split */
+#define XLOG_BTREE_INSERT_UPPER	0x10	/* same, on a non-leaf page */
+#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
+#define XLOG_BTREE_SPLIT_L		0x30	/* add btitem with split */
+#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
+#define XLOG_BTREE_SPLIT_L_ROOT	0x50	/* add btitem with split of root */
+#define XLOG_BTREE_SPLIT_R_ROOT	0x60	/* as above, new item on right */
+#define XLOG_BTREE_DELETE		0x70	/* delete leaf btitem */
+#define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
+#define XLOG_BTREE_DELETE_PAGE_META	0x90 /* same, plus update metapage */
+#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
+#define XLOG_BTREE_NEWMETA		0xB0	/* update metadata page */
+#define XLOG_BTREE_NEWPAGE		0xC0	/* new index page during build */
+
+/*
+ * All that we need to find the changed index tuple
+ */
+typedef struct xl_btreetid
+{
+	RelFileNode node;
+	ItemPointerData tid;		/* changed tuple id */
+} xl_btreetid;
+
+/*
+ * All that we need to regenerate the meta-data page
+ */
+typedef struct xl_btree_metadata
+{
+	BlockNumber root;
+	uint32		level;
+	BlockNumber fastroot;
+	uint32		fastlevel;
+} xl_btree_metadata;
+
+/*
+ * This is what we need to know about a simple (without split) insert.
+ *
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
+ * Note that INSERT_META implies it's not a leaf page.
+ */
+typedef struct xl_btree_insert
+{
+	xl_btreetid target;			/* inserted tuple id */
+	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
+	/* BTITEM FOLLOWS AT END OF STRUCT */
+} xl_btree_insert;
+
+#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
+
+/*
+ * On insert with split we save items of both left and right siblings
+ * and restore content of both pages from log record.  This way takes less
+ * xlog space than the normal approach, because if we did it standardly,
+ * XLogInsert would almost always think the right page is new and store its
+ * whole page image.
+ *
+ * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
+ * The _L and _R variants indicate whether the inserted btitem went into the
+ * left or right split page (and thus, whether otherblk is the right or left
+ * page of the split pair).  The _ROOT variants indicate that we are splitting
+ * the root page, and thus that a newroot record rather than an insert or
+ * split record should follow.  Note that a split record never carries a
+ * metapage update --- we'll do that in the parent-level update.
+ */
+typedef struct xl_btree_split
+{
+	xl_btreetid target;			/* inserted tuple id */
+	BlockNumber otherblk;		/* second block participated in split: */
+	/* first one is stored in target' tid */
+	BlockNumber leftblk;		/* prev/left block */
+	BlockNumber rightblk;		/* next/right block */
+	uint32		level;			/* tree level of page being split */
+	uint16		leftlen;		/* len of left page items below */
+	/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
+} xl_btree_split;
+
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
+
+/*
+ * This is what we need to know about delete of an individual leaf btitem
+ */
+typedef struct xl_btree_delete
+{
+	xl_btreetid target;			/* deleted tuple id */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
+
+/*
+ * This is what we need to know about deletion of a btree page.  The target
+ * identifies the tuple removed from the parent page (note that we remove
+ * this tuple's downlink and the *following* tuple's key).  Note we do not
+ * store any content for the deleted page --- it is just rewritten as empty
+ * during recovery.
+ */
+typedef struct xl_btree_delete_page
+{
+	xl_btreetid target;			/* deleted tuple id in parent page */
+	BlockNumber deadblk;		/* child block being deleted */
+	BlockNumber leftblk;		/* child block's left sibling, if any */
+	BlockNumber rightblk;		/* child block's right sibling */
+	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
+} xl_btree_delete_page;
+
+#define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
+
+/*
+ * New root log record.  There are zero btitems if this is to establish an
+ * empty root, or two if it is the result of splitting an old root.
+ *
+ * Note that although this implies rewriting the metadata page, we don't need
+ * an xl_btree_metadata record --- the rootblk and level are sufficient.
+ */
+typedef struct xl_btree_newroot
+{
+	RelFileNode node;
+	BlockNumber rootblk;		/* location of new root */
+	uint32		level;			/* its tree level */
+	/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
+} xl_btree_newroot;
+
+#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))
+
+/*
+ * New metapage log record.  This is not issued during routine operations;
+ * it's only used when initializing an empty index and at completion of
+ * index build.
+ */
+typedef struct xl_btree_newmeta
+{
+	RelFileNode node;
+	xl_btree_metadata meta;
+} xl_btree_newmeta;
+
+#define SizeOfBtreeNewmeta	(sizeof(xl_btree_newmeta))
+
+/*
+ * New index page log record.  This is only used while building a new index.
+ */
+typedef struct xl_btree_newpage
+{
+	RelFileNode node;
+	BlockNumber blkno;			/* location of new page */
+	/* entire page contents follow at end of record */
+} xl_btree_newpage;
+
+#define SizeOfBtreeNewpage	(offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
+
+
+/*
+ *	Operator strategy numbers -- ordering of these is <, <=, =, >=, >
+ */
+
+#define BTLessStrategyNumber			1
+#define BTLessEqualStrategyNumber		2
+#define BTEqualStrategyNumber			3
+#define BTGreaterEqualStrategyNumber	4
+#define BTGreaterStrategyNumber			5
+#define BTMaxStrategyNumber				5
+
+/*
+ *	When a new operator class is declared, we require that the user
+ *	supply us with an amproc procedure for determining whether, for
+ *	two keys a and b, a < b, a = b, or a > b.  This routine must
+ *	return < 0, 0, > 0, respectively, in these three cases.  Since we
+ *	only have one such proc in amproc, it's number 1.
+ */
+
+#define BTORDER_PROC	1
+
+/*
+ *	We need to be able to tell the difference between read and write
+ *	requests for pages, in order to do locking correctly.
+ */
+
+#define BT_READ			BUFFER_LOCK_SHARE
+#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE
+
+/*
+ *	BTStackData -- As we descend a tree, we push the (location, downlink)
+ *	pairs from internal pages onto a private stack.  If we split a
+ *	leaf, we use this stack to walk back up the tree and insert data
+ *	into parent pages (and possibly to split them, too).  Lehman and
+ *	Yao's update algorithm guarantees that under no circumstances can
+ *	our private stack give us an irredeemably bad picture up the tree.
+ *	Again, see the paper for details.
+ */
+
+typedef struct BTStackData
+{
+	BlockNumber bts_blkno;
+	OffsetNumber bts_offset;
+	BTItemData	bts_btitem;
+	struct BTStackData *bts_parent;
+} BTStackData;
+
+typedef BTStackData *BTStack;
+
 /*
  *	BTScanOpaqueData is used to remember which buffers we're currently
  *	examining in the scan.	We keep these buffers pinned (but not locked,
@@ -116,212 +409,6 @@ typedef struct BTScanOpaqueData
 
 typedef BTScanOpaqueData *BTScanOpaque;
 
-/*
- *	BTItems are what we store in the btree.  Each item is an index tuple,
- *	including key and pointer values.  (In some cases either the key or the
- *	pointer may go unused, see backend/access/nbtree/README for details.)
- *
- *	Old comments:
- *	In addition, we must guarantee that all tuples in the index are unique,
- *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
- *	do this is by generating a new OID for every insertion that we do in the
- *	tree.  This adds eight bytes to the size of btree index tuples.  Note
- *	that we do not use the OID as part of a composite key; the OID only
- *	serves as a unique identifier for a given index tuple (logical position
- *	within a page).
- *
- *	New comments:
- *	actually, we must guarantee that all tuples in A LEVEL
- *	are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
- *	as unique identifier for a given index tuple (logical position
- *	within a level). - vadim 04/09/97
- */
-
-typedef struct BTItemData
-{
-	IndexTupleData bti_itup;
-} BTItemData;
-
-typedef BTItemData *BTItem;
-
-/*
- * For XLOG: size without alignement. Sizeof works as long as
- * IndexTupleData has exactly 8 bytes.
- */
-#define SizeOfBTItem	sizeof(BTItemData)
-
-/* Test whether items are the "same" per the above notes */
-#define BTItemSame(i1, i2)	  ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
-								(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
-								(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
-								(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
-								(i1)->bti_itup.t_tid.ip_posid == \
-								(i2)->bti_itup.t_tid.ip_posid )
-
-/*
- *	BTStackData -- As we descend a tree, we push the (key, pointer)
- *	pairs from internal nodes onto a private stack.  If we split a
- *	leaf, we use this stack to walk back up the tree and insert data
- *	into parent nodes (and possibly to split them, too).  Lehman and
- *	Yao's update algorithm guarantees that under no circumstances can
- *	our private stack give us an irredeemably bad picture up the tree.
- *	Again, see the paper for details.
- */
-
-typedef struct BTStackData
-{
-	BlockNumber bts_blkno;
-	OffsetNumber bts_offset;
-	BTItemData	bts_btitem;
-	struct BTStackData *bts_parent;
-} BTStackData;
-
-typedef BTStackData *BTStack;
-
-/*
- *	We need to be able to tell the difference between read and write
- *	requests for pages, in order to do locking correctly.
- */
-
-#define BT_READ			BUFFER_LOCK_SHARE
-#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE
-
-/*
- *	In general, the btree code tries to localize its knowledge about
- *	page layout to a couple of routines.  However, we need a special
- *	value to indicate "no page number" in those places where we expect
- *	page numbers.  We can use zero for this because we never need to
- *	make a pointer to the metadata page.
- */
-
-#define P_NONE			0
-
-/*
- * Macros to test whether a page is leftmost or rightmost on its tree level,
- * as well as other state info kept in the opaque data.
- */
-#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
-#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
-#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
-#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
-
-/*
- *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
- *	page.  The high key is not a data key, but gives info about what range of
- *	keys is supposed to be on this page.  The high key on a page is required
- *	to be greater than or equal to any data key that appears on the page.
- *	If we find ourselves trying to insert a key > high key, we know we need
- *	to move right (this should only happen if the page was split since we
- *	examined the parent page).
- *
- *	Our insertion algorithm guarantees that we can use the initial least key
- *	on our right sibling as the high key.  Once a page is created, its high
- *	key changes only if the page is split.
- *
- *	On a non-rightmost page, the high key lives in item 1 and data items
- *	start in item 2.  Rightmost pages have no high key, so we store data
- *	items beginning in item 1.
- */
-
-#define P_HIKEY				((OffsetNumber) 1)
-#define P_FIRSTKEY			((OffsetNumber) 2)
-#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
-
-/*
- * XLOG allows to store some information in high 4 bits of log
- * record xl_info field
- */
-#define XLOG_BTREE_DELETE	0x00	/* delete btitem */
-#define XLOG_BTREE_INSERT	0x10	/* add btitem without split */
-#define XLOG_BTREE_SPLIT	0x20	/* add btitem with split */
-#define XLOG_BTREE_SPLEFT	0x30	/* as above + flag that new btitem */
- /* goes to the left sibling */
-#define XLOG_BTREE_NEWROOT	0x40	/* new root page */
-
-#define XLOG_BTREE_LEAF		0x80	/* leaf/internal page was changed */
-
-/*
- * All what we need to find changed index tuple
- */
-typedef struct xl_btreetid
-{
-	RelFileNode node;
-	ItemPointerData tid;		/* changed tuple id */
-} xl_btreetid;
-
-/*
- * This is what we need to know about delete
- */
-typedef struct xl_btree_delete
-{
-	xl_btreetid target;			/* deleted tuple id */
-} xl_btree_delete;
-
-#define SizeOfBtreeDelete	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
-
-/*
- * This is what we need to know about pure (without split) insert
- */
-typedef struct xl_btree_insert
-{
-	xl_btreetid target;			/* inserted tuple id */
-	/* BTITEM FOLLOWS AT END OF STRUCT */
-} xl_btree_insert;
-
-#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
-
-/*
- * On insert with split we save items of both left and right siblings
- * and restore content of both pages from log record
- */
-typedef struct xl_btree_split
-{
-	xl_btreetid target;			/* inserted tuple id */
-	BlockIdData otherblk;		/* second block participated in split: */
-	/* first one is stored in target' tid */
-	BlockIdData parentblk;		/* parent block */
-	BlockIdData leftblk;		/* prev left block */
-	BlockIdData rightblk;		/* next right block */
-	uint16		leftlen;		/* len of left page items below */
-	/* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */
-} xl_btree_split;
-
-#define SizeOfBtreeSplit	(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
-
-/*
- * New root log record.
- */
-typedef struct xl_btree_newroot
-{
-	RelFileNode node;
-	int32		level;
-	BlockIdData rootblk;
-	/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
-} xl_btree_newroot;
-
-#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData))
-
-/*
- *	Operator strategy numbers -- ordering of these is <, <=, =, >=, >
- */
-
-#define BTLessStrategyNumber			1
-#define BTLessEqualStrategyNumber		2
-#define BTEqualStrategyNumber			3
-#define BTGreaterEqualStrategyNumber	4
-#define BTGreaterStrategyNumber			5
-#define BTMaxStrategyNumber				5
-
-/*
- *	When a new operator class is declared, we require that the user
- *	supply us with an amproc procedure for determining whether, for
- *	two keys a and b, a < b, a = b, or a > b.  This routine must
- *	return < 0, 0, > 0, respectively, in these three cases.  Since we
- *	only have one such proc in amproc, it's number 1.
- */
-
-#define BTORDER_PROC	1
-
 /*
  * prototypes for functions in nbtree.c (external entry points for btree)
  */
@@ -340,27 +427,26 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS);
 extern Datum btrestrpos(PG_FUNCTION_ARGS);
 extern Datum btbulkdelete(PG_FUNCTION_ARGS);
 
-extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_desc(char *buf, uint8 xl_info, char *rec);
-
 /*
  * prototypes for functions in nbtinsert.c
  */
 extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
 			 bool index_is_unique, Relation heapRel);
+extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+							  BTStack stack, bool is_root, bool is_only);
 
 /*
  * prototypes for functions in nbtpage.c
  */
 extern void _bt_metapinit(Relation rel);
 extern Buffer _bt_getroot(Relation rel, int access);
+extern Buffer _bt_gettrueroot(Relation rel);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
 extern void _bt_wrtbuf(Relation rel, Buffer buf);
 extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
-extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
+extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
 extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
 
 /*
@@ -377,6 +463,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
  * prototypes for functions in nbtstrat.c
@@ -407,4 +494,13 @@ extern void _bt_spooldestroy(BTSpool *btspool);
 extern void _bt_spool(BTItem btitem, BTSpool *btspool);
 extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
 
+/*
+ * prototypes for functions in nbtxlog.c
+ */
+extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_desc(char *buf, uint8 xl_info, char *rec);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
+
 #endif   /* NBTREE_H */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1659c65b93..a1be9bacf3 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: xlog.h,v 1.40 2002/11/15 02:44:57 momjian Exp $
+ * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -145,10 +145,12 @@ typedef XLogPageHeaderData *XLogPageHeader;
  */
 typedef struct RmgrData
 {
-	char	   *rm_name;
+	const char *rm_name;
 	void		(*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
 	void		(*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr);
 	void		(*rm_desc) (char *buf, uint8 xl_info, char *rec);
+	void		(*rm_startup) (void);	/* NOTE(review): presumably called once before WAL replay starts -- confirm in xlog.c */
+	void		(*rm_cleanup) (void);	/* NOTE(review): presumably called once after WAL replay finishes -- confirm in xlog.c */
 } RmgrData;
 
 extern RmgrData RmgrTable[];
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 03e452121f..240889577a 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: catversion.h,v 1.177 2003/02/16 02:30:39 tgl Exp $
+ * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200302151
+#define CATALOG_VERSION_NO	200302171
 
 #endif