From 70508ba7aed76954b7e630a4952e1360c15db830 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 21 Feb 2003 00:06:22 +0000 Subject: [PATCH] Make btree index structure adjustments and WAL logging changes needed to support btree compaction, as per proposal of a few days ago. btree index pages no longer store parent links, instead they have a level indicator (counting up from zero for leaf pages). The FixBTree recovery logic is removed, and replaced by code that detects missing parent-level insertions during WAL replay. Also, generate appropriate WAL entries when updating btree metapage and when building a btree index from scratch. I believe btree indexes are now completely WAL-legal for the first time. initdb forced due to index and WAL changes. --- src/backend/access/nbtree/Makefile | 4 +- src/backend/access/nbtree/README | 506 +++++++---- src/backend/access/nbtree/nbtinsert.c | 1199 +++++++------------------ src/backend/access/nbtree/nbtpage.c | 260 ++++-- src/backend/access/nbtree/nbtree.c | 399 +------- src/backend/access/nbtree/nbtsearch.c | 133 ++- src/backend/access/nbtree/nbtsort.c | 98 +- src/backend/access/nbtree/nbtxlog.c | 780 ++++++++++++++++ src/backend/access/transam/rmgr.c | 37 +- src/backend/access/transam/xlog.c | 58 +- src/include/access/nbtree.h | 570 +++++++----- src/include/access/xlog.h | 6 +- src/include/catalog/catversion.h | 4 +- 13 files changed, 2179 insertions(+), 1875 deletions(-) create mode 100644 src/backend/access/nbtree/nbtxlog.c diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bdc366dd0a..cf525f9f1f 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -4,7 +4,7 @@ # Makefile for access/nbtree # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $ +# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.12 2003/02/21 00:06:21 tgl Exp $ # 
#------------------------------------------------------------------------- @@ -13,7 +13,7 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ - nbtstrat.o nbtutils.o nbtsort.o + nbtstrat.o nbtutils.o nbtsort.o nbtxlog.o all: SUBSYS.o diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 964b8b4e11..8fc0c5c7bf 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,186 +1,378 @@ -$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions -on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). +on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also +use a simplified version of the deletion logic described in Lanin and +Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm, +Proceedings of 1986 Fall Joint Computer Conference, pp 380-389). -We have made the following changes in order to incorporate their algorithm +The Lehman and Yao algorithm and insertions +------------------------------------------- + +We have made the following changes in order to incorporate the L&Y algorithm into Postgres: -+ The requirement that all btree keys be unique is too onerous, - but the algorithm won't work correctly without it. Fortunately, it is - only necessary that keys be unique on a single tree level, because L&Y - only use the assumption of key uniqueness when re-finding a key in a - parent node (to determine where to insert the key for a split page). 
- Therefore, we can use the link field to disambiguate multiple - occurrences of the same user key: only one entry in the parent level - will be pointing at the page we had split. (Indeed we need not look at - the real "key" at all, just at the link field.) We can distinguish - items at the leaf level in the same way, by examining their links to - heap tuples; we'd never have two items for the same heap tuple. +The requirement that all btree keys be unique is too onerous, +but the algorithm won't work correctly without it. Fortunately, it is +only necessary that keys be unique on a single tree level, because L&Y +only use the assumption of key uniqueness when re-finding a key in a +parent page (to determine where to insert the key for a split page). +Therefore, we can use the link field to disambiguate multiple +occurrences of the same user key: only one entry in the parent level +will be pointing at the page we had split. (Indeed we need not look at +the real "key" at all, just at the link field.) We can distinguish +items at the leaf level in the same way, by examining their links to +heap tuples; we'd never have two items for the same heap tuple. -+ Lehman and Yao assume that the key range for a subtree S is described - by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent - node. This does not work for nonunique keys (for example, if we have - enough equal keys to spread across several leaf pages, there *must* be - some equal bounding keys in the first level up). Therefore we assume - Ki <= v <= Ki+1 instead. A search that finds exact equality to a - bounding key in an upper tree level must descend to the left of that - key to ensure it finds any equal keys in the preceding page. An - insertion that sees the high key of its target page is equal to the key - to be inserted has a choice whether or not to move right, since the new - key could go on either page. 
(Currently, we try to find a page where - there is room for the new key without a split.) +Lehman and Yao assume that the key range for a subtree S is described +by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent +page. This does not work for nonunique keys (for example, if we have +enough equal keys to spread across several leaf pages, there *must* be +some equal bounding keys in the first level up). Therefore we assume +Ki <= v <= Ki+1 instead. A search that finds exact equality to a +bounding key in an upper tree level must descend to the left of that +key to ensure it finds any equal keys in the preceding page. An +insertion that sees the high key of its target page is equal to the key +to be inserted has a choice whether or not to move right, since the new +key could go on either page. (Currently, we try to find a page where +there is room for the new key without a split.) -+ Lehman and Yao don't require read locks, but assume that in-memory - copies of tree nodes are unshared. Postgres shares in-memory buffers - among backends. As a result, we do page-level read locking on btree - nodes in order to guarantee that no record is modified while we are - examining it. This reduces concurrency but guaranteees correct - behavior. An advantage is that when trading in a read lock for a - write lock, we need not re-read the page after getting the write lock. - Since we're also holding a pin on the shared buffer containing the - page, we know that buffer still contains the page and is up-to-date. +Lehman and Yao don't require read locks, but assume that in-memory +copies of tree pages are unshared. Postgres shares in-memory buffers +among backends. As a result, we do page-level read locking on btree +pages in order to guarantee that no record is modified while we are +examining it. This reduces concurrency but guarantees correct +behavior. 
An advantage is that when trading in a read lock for a +write lock, we need not re-read the page after getting the write lock. +Since we're also holding a pin on the shared buffer containing the +page, we know that buffer still contains the page and is up-to-date. -+ We support the notion of an ordered "scan" of an index as well as - insertions, deletions, and simple lookups. A scan in the forward - direction is no problem, we just use the right-sibling pointers that - L&Y require anyway. (Thus, once we have descended the tree to the - correct start point for the scan, the scan looks only at leaf pages - and never at higher tree levels.) To support scans in the backward - direction, we also store a "left sibling" link much like the "right - sibling". (This adds an extra step to the L&Y split algorithm: while - holding the write lock on the page being split, we also lock its former - right sibling to update that page's left-link. This is safe since no - writer of that page can be interested in acquiring a write lock on our - page.) A backwards scan has one additional bit of complexity: after - following the left-link we must account for the possibility that the - left sibling page got split before we could read it. So, we have to - move right until we find a page whose right-link matches the page we - came from. +We support the notion of an ordered "scan" of an index as well as +insertions, deletions, and simple lookups. A scan in the forward +direction is no problem, we just use the right-sibling pointers that +L&Y require anyway. (Thus, once we have descended the tree to the +correct start point for the scan, the scan looks only at leaf pages +and never at higher tree levels.) To support scans in the backward +direction, we also store a "left sibling" link much like the "right +sibling". (This adds an extra step to the L&Y split algorithm: while +holding the write lock on the page being split, we also lock its former +right sibling to update that page's left-link. 
This is safe since no +writer of that page can be interested in acquiring a write lock on our +page.) A backwards scan has one additional bit of complexity: after +following the left-link we must account for the possibility that the +left sibling page got split before we could read it. So, we have to +move right until we find a page whose right-link matches the page we +came from. (Actually, it's even harder than that; see deletion discussion +below.) -+ Read locks on a page are held for as long as a scan is examining a page. - But nbtree.c arranges to drop the read lock, but not the buffer pin, - on the current page of a scan before control leaves nbtree. When we - come back to resume the scan, we have to re-grab the read lock and - then move right if the current item moved (see _bt_restscan()). Keeping - the pin ensures that the current item cannot move left or be deleted - (see btbulkdelete). +Read locks on a page are held for as long as a scan is examining a page. +But nbtree.c arranges to drop the read lock, but not the buffer pin, +on the current page of a scan before control leaves nbtree. When we +come back to resume the scan, we have to re-grab the read lock and +then move right if the current item moved (see _bt_restscan()). Keeping +the pin ensures that the current item cannot move left or be deleted +(see btbulkdelete). -+ In most cases we release our lock and pin on a page before attempting - to acquire pin and lock on the page we are moving to. In a few places - it is necessary to lock the next page before releasing the current one. - This is safe when moving right or up, but not when moving left or down - (else we'd create the possibility of deadlocks). +In most cases we release our lock and pin on a page before attempting +to acquire pin and lock on the page we are moving to. In a few places +it is necessary to lock the next page before releasing the current one. 
+This is safe when moving right or up, but not when moving left or down +(else we'd create the possibility of deadlocks). -+ Lehman and Yao fail to discuss what must happen when the root page - becomes full and must be split. Our implementation is to split the - root in the same way that any other page would be split, then construct - a new root page holding pointers to both of the resulting pages (which - now become siblings on level 2 of the tree). The new root page is then - installed by altering the root pointer in the meta-data page (see - below). This works because the root is not treated specially in any - other way --- in particular, searches will move right using its link - pointer if the link is set. Therefore, searches will find the data - that's been moved into the right sibling even if they read the metadata - page before it got updated. This is the same reasoning that makes a - split of a non-root page safe. The locking considerations are similar too. +Lehman and Yao fail to discuss what must happen when the root page +becomes full and must be split. Our implementation is to split the +root in the same way that any other page would be split, then construct +a new root page holding pointers to both of the resulting pages (which +now become siblings on the next level of the tree). The new root page +is then installed by altering the root pointer in the meta-data page (see +below). This works because the root is not treated specially in any +other way --- in particular, searches will move right using its link +pointer if the link is set. Therefore, searches will find the data +that's been moved into the right sibling even if they read the meta-data +page before it got updated. This is the same reasoning that makes a +split of a non-root page safe. The locking considerations are similar too. -+ Lehman and Yao assume fixed-size keys, but we must deal with - variable-size keys. 
Therefore there is not a fixed maximum number of - keys per page; we just stuff in as many as will fit. When we split a - page, we try to equalize the number of bytes, not items, assigned to - each of the resulting pages. Note we must include the incoming item in - this calculation, otherwise it is possible to find that the incoming - item doesn't fit on the split page where it needs to go! +When an inserter recurses up the tree, splitting internal pages to insert +links to pages inserted on the level below, it is possible that it will +need to access a page above the level that was the root when it began its +descent (or more accurately, the level that was the root when it read the +meta-data page). In this case the stack it made while descending does not +help for finding the correct page. When this happens, we find the correct +place by re-descending the tree until we reach the level one above the +level we need to insert a link to, and then moving right as necessary. +(Typically this will take only two fetches, the meta-data page and the new +root, but in principle there could have been more than one root split +since we saw the root. We can identify the correct tree level by means of +the level numbers stored in each page. The situation is rare enough that +we do not need a more efficient solution.) -In addition, the following things are handy to know: +Lehman and Yao assume fixed-size keys, but we must deal with +variable-size keys. Therefore there is not a fixed maximum number of +keys per page; we just stuff in as many as will fit. When we split a +page, we try to equalize the number of bytes, not items, assigned to +each of the resulting pages. Note we must include the incoming item in +this calculation, otherwise it is possible to find that the incoming +item doesn't fit on the split page where it needs to go! -+ Page zero of every btree is a meta-data page. 
This page stores - the location of the root page, a pointer to a list of free - pages, and other stuff that's handy to know. (Currently, we - never shrink btree indexes so there are never any free pages.) +The deletion algorithm +---------------------- -+ The algorithm assumes we can fit at least three items per page - (a "high key" and two real data items). Therefore it's unsafe - to accept items larger than 1/3rd page size. Larger items would - work sometimes, but could cause failures later on depending on - what else gets put on their page. +Deletions of leaf items are handled by getting a super-exclusive lock on +the target page, so that no other backend has a pin on the page when the +deletion starts. This means no scan is pointing at the page, so no other +backend can lose its place due to the item deletion. -+ This algorithm doesn't guarantee btree consistency after a kernel crash - or hardware failure. To do that, we'd need ordered writes, and UNIX - doesn't support ordered writes (short of fsync'ing every update, which - is too high a price). Rebuilding corrupted indexes during restart - seems more attractive. +The above does not work for deletion of items in internal pages, since +other backends keep no lock nor pin on a page they have descended past. +Instead, when a backend is ascending the tree using its stack, it must +be prepared for the possibility that the item it wants is to the left of +the recorded position (but it can't have moved left out of the recorded +page). Since we hold a lock on the lower page (per L&Y) until we have +re-found the parent item that links to it, we can be assured that the +parent item does still exist and can't have been deleted. Also, because +we are matching downlink page numbers and not data keys, we don't have any +problem with possibly misidentifying the parent item. 
-+ Deletions are handled by getting a super-exclusive lock on the target - page, so that no other backend has a pin on the page when the deletion - starts. This means no scan is pointing at the page. This is OK for - deleting leaf items, probably not OK for deleting internal nodes; - will need to think harder when it's time to support index compaction. +We consider deleting an entire page from the btree only when it's become +completely empty of items. (Merging partly-full pages would allow better +space reuse, but it seems impractical to move existing data items left or +right to make this happen --- a scan moving in the opposite direction +might miss the items if so. We could do it during VACUUM FULL, though.) +Also, we *never* delete the rightmost page on a tree level (this +restriction simplifies the traversal algorithms, as explained below). -+ "ScanKey" data structures are used in two fundamentally different ways - in this code. Searches for the initial position for a scan, as well as - insertions, use scankeys in which the comparison function is a 3-way - comparator (<0, =0, >0 result). These scankeys are built within the - btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we - are positioned, sequential examination of tuples in a scan is done by - _bt_checkkeys() using scankeys in which the comparison functions return - booleans --- for example, int4lt might be used. These scankeys are the - ones originally passed in from outside the btree code. Same - representation, but different comparison functions! +To delete an empty page, we acquire write lock on its left sibling (if +any), the target page itself, the right sibling (there must be one), and +the parent page, in that order. The parent page must be found using the +same type of search as used to find the parent during an insertion split. 
+Then we update the side-links in the siblings, mark the target page +deleted, and remove the downlink from the parent, as well as the parent's +upper bounding key for the target (the one separating it from its right +sibling). This causes the target page's key space to effectively belong +to its right sibling. (Neither the left nor right sibling pages need to +change their "high key" if any; so there is no problem with possibly not +having enough space to replace a high key.) The side-links in the target +page are not changed. -Notes about data representation: +(Note: Lanin and Shasha prefer to make the key space move left, but their +argument for doing so hinges on not having left-links, which we have +anyway. So we simplify the algorithm by moving key space right.) -+ The right-sibling link required by L&Y is kept in the page "opaque - data" area, as is the left-sibling link and some flags. +To preserve consistency on the parent level, we cannot merge the key space +of a page into its right sibling unless the right sibling is a child of +the same parent --- otherwise, the parent's key space assignment changes +too, meaning we'd have to make bounding-key updates in its parent, and +perhaps all the way up the tree. Since we can't possibly do that +atomically, we forbid this case. That means that the rightmost child of a +parent node can't be deleted unless it's the only remaining child. -+ We also keep a parent link in the opaque data, but this link is not - very trustworthy because it is not updated when the parent page splits. - Thus, it points to some page on the parent level, but possibly a page - well to the left of the page's actual current parent. In most cases - we do not need this link at all. Normally we return to a parent page - using a stack of entries that are made as we descend the tree, as in L&Y. - There is exactly one case where the stack will not help: concurrent - root splits. 
If an inserter process needs to split what had been the - root when it started its descent, but finds that that page is no longer - the root (because someone else split it meanwhile), then it uses the - parent link to move up to the next level. This is OK because we do fix - the parent link in a former root page when splitting it. This logic - will work even if the root is split multiple times (even up to creation - of multiple new levels) before an inserter returns to it. The same - could not be said of finding the new root via the metapage, since that - would work only for a single level of added root. +When we delete the last remaining child of a parent page, we mark the +parent page "half-dead" as part of the atomic update that deletes the +child page. This implicitly transfers the parent's key space to its right +sibling (which it must have, since we never delete the overall-rightmost +page of a level). No future insertions into the parent level are allowed +to insert keys into the half-dead page --- they must move right to its +sibling, instead. The parent remains empty and can be deleted in a +separate atomic action. (However, if it's the rightmost child of its own +parent, it might have to stay half-dead for awhile, until it's also the +only child.) -+ The Postgres disk block data format (an array of items) doesn't fit - Lehman and Yao's alternating-keys-and-pointers notion of a disk page, - so we have to play some games. +Note that an empty leaf page is a valid tree state, but an empty interior +page is not legal (an interior page must have children to delegate its +key space to). So an interior page *must* be marked half-dead as soon +as its last child is deleted. -+ On a page that is not rightmost in its tree level, the "high key" is - kept in the page's first item, and real data items start at item 2. - The link portion of the "high key" item goes unused. A page that is - rightmost has no "high key", so data items start with the first item. 
- Putting the high key at the left, rather than the right, may seem odd, - but it avoids moving the high key as we add data items. +The notion of a half-dead page means that the key space relationship between +the half-dead page's level and its parent's level may be a little out of +whack: key space that appears to belong to the half-dead page's parent on the +parent level may really belong to its right sibling. We can tolerate this, +however, because insertions and deletions on upper tree levels are always +done by reference to child page numbers, not keys. The only cost is that +searches may sometimes descend to the half-dead page and then have to move +right, rather than going directly to the sibling page. -+ On a leaf page, the data items are simply links to (TIDs of) tuples - in the relation being indexed, with the associated key values. +A deleted page cannot be reclaimed immediately, since there may be other +processes waiting to reference it (ie, search processes that just left the +parent, or scans moving right or left from one of the siblings). These +processes must observe that the page is marked dead and recover +accordingly. Searches and forward scans simply follow the right-link +until they find a non-dead page --- this will be where the deleted page's +key-space moved to. -+ On a non-leaf page, the data items are down-links to child pages with - bounding keys. The key in each data item is the *lower* bound for - keys on that child page, so logically the key is to the left of that - downlink. The high key (if present) is the upper bound for the last - downlink. The first data item on each such page has no lower bound - --- or lower bound of minus infinity, if you prefer. The comparison - routines must treat it accordingly. The actual key stored in the - item is irrelevant, and need not be stored at all. This arrangement - corresponds to the fact that an L&Y non-leaf page has one more pointer - than key. 
+Stepping left in a backward scan is complicated because we must consider +the possibility that the left sibling was just split (meaning we must find +the rightmost page derived from the left sibling), plus the possibility +that the page we were just on has now been deleted and hence isn't in the +sibling chain at all anymore. So the move-left algorithm becomes: +0. Remember the page we are on as the "original page". +1. Follow the original page's left-link (we're done if this is zero). +2. If the current page is live and its right-link matches the "original + page", we are done. +3. Otherwise, move right one or more times looking for a live page whose + right-link matches the "original page". If found, we are done. (In + principle we could scan all the way to the right end of the index, but + in practice it seems better to give up after a small number of tries. + It's unlikely the original page's sibling split more than a few times + while we were in flight to it; if we do not find a matching link in a + few tries, then most likely the original page is deleted.) +4. Return to the "original page". If it is still live, return to step 1 + (we guessed wrong about it being deleted, and should restart with its + current left-link). If it is dead, move right until a non-dead page + is found (there must be one, since rightmost pages are never deleted), + mark that as the new "original page", and return to step 1. +This algorithm is correct because the live page found by step 4 will have +the same left keyspace boundary as the page we started from. Therefore, +when we ultimately exit, it must be on a page whose right keyspace +boundary matches the left boundary of where we started --- which is what +we need to be sure we don't miss or re-scan any items. -Notes to operator class implementors: +A deleted page can only be reclaimed once there is no scan or search that +has a reference to it; until then, it must stay in place with its +right-link undisturbed. 
We implement this by waiting until all +transactions that were running at the time of deletion are dead; which is +overly strong, but is simple to implement within Postgres. When marked +dead, a deleted page is labeled with the next-transaction counter value. +VACUUM can reclaim the page for re-use when this transaction number is +older than the oldest open transaction. (NOTE: VACUUM FULL can reclaim +such pages immediately.) -+ With this implementation, we require each supported datatype to supply - us with a comparison procedure via pg_amproc. This procedure must take - two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B, - A = B, or A > B, respectively. See nbtcompare.c for examples. +Reclaiming a page doesn't actually change its state on disk --- we simply +record it in the shared-memory free space map, from which it will be +handed out the next time a new page is needed for a page split. The +deleted page's contents will be overwritten by the split operation. +(Note: if we find a deleted page with an extremely old transaction +number, it'd be worthwhile to re-mark it with FrozenTransactionId so that +a later xid wraparound can't cause us to think the page is unreclaimable. +But in more normal situations this would be a waste of a disk write.) + +Because we never delete the rightmost page of any level (and in particular +never delete the root), it's impossible for the height of the tree to +decrease. After massive deletions we might have a scenario in which the +tree is "skinny", with several single-page levels below the root. +Operations will still be correct in this case, but we'd waste cycles +descending through the single-page levels. To handle this we use an idea +from Lanin and Shasha: we keep track of the "fast root" level, which is +the lowest single-page level. The meta-data page keeps a pointer to this +level as well as the true root. All ordinary operations initiate their +searches at the fast root not the true root. 
When we split a page that is +alone on its level or delete the next-to-last page on a level (both cases +are easily detected), we have to make sure that the fast root pointer is +adjusted appropriately. In the split case, we do this work as part of the +atomic update for the insertion into the parent level; in the delete case +as part of the atomic update for the delete (either way, the metapage has +to be the last page locked in the update to avoid deadlock risks). This +avoids race conditions if two such operations are executing concurrently. + +VACUUM needs to do a linear scan of an index to search for empty leaf +pages and half-dead parent pages that can be deleted, as well as deleted +pages that can be reclaimed because they are older than all open +transactions. + +WAL considerations +------------------ + +The insertion and deletion algorithms in themselves don't guarantee btree +consistency after a crash. To provide robustness, we depend on WAL +replay. A single WAL entry is effectively an atomic action --- we can +redo it from the log if it fails to complete. + +Ordinary item insertions (that don't force a page split) are of course +single WAL entries, since they only affect one page. The same for +leaf-item deletions (if the deletion brings the leaf page to zero items, +it is now a candidate to be deleted, but that is a separate action). + +An insertion that causes a page split is logged as a single WAL entry for +the changes occurring on the insertion's level --- including update of the +right sibling's left-link --- followed by a second WAL entry for the +insertion on the parent level (which might itself be a page split, requiring +an additional insertion above that, etc). + +For a root split, the follow-on WAL entry is a "new root" entry rather than +an "insertion" entry, but details are otherwise much the same. 
+ +Because insertion involves multiple atomic actions, the WAL replay logic +has to detect the case where a page split isn't followed by a matching +insertion on the parent level, and then do that insertion on its own (and +recursively for any subsequent parent insertion, of course). This is +feasible because the WAL entry for the split contains enough info to know +what must be inserted in the parent level. + +When splitting a non-root page that is alone on its level, the required +metapage update (of the "fast root" link) is performed and logged as part +of the insertion into the parent level. When splitting the root page, the +metapage update is handled as part of the "new root" action. + +A page deletion is logged as a single WAL entry covering all four +required page updates (target page, left and right siblings, and parent) +as an atomic event. (Any required fast-root link update is also part +of the WAL entry.) If the parent page becomes half-dead but is not +immediately deleted due to a subsequent crash, there is no loss of +consistency, and the empty page will be picked up by the next VACUUM. + +Other things that are handy to know +----------------------------------- + +Page zero of every btree is a meta-data page. This page stores the +location of the root page --- both the true root and the current effective +root ("fast" root). + +The algorithm assumes we can fit at least three items per page +(a "high key" and two real data items). Therefore it's unsafe +to accept items larger than 1/3rd page size. Larger items would +work sometimes, but could cause failures later on depending on +what else gets put on their page. + +"ScanKey" data structures are used in two fundamentally different ways +in this code. Searches for the initial position for a scan, as well as +insertions, use scankeys in which the comparison function is a 3-way +comparator (<0, =0, >0 result). 
These scankeys are built within the +btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we +are positioned, sequential examination of tuples in a scan is done by +_bt_checkkeys() using scankeys in which the comparison functions return +booleans --- for example, int4lt might be used. These scankeys are the +ones originally passed in from outside the btree code. Same +representation, but different comparison functions! + +Notes about data representation +------------------------------- + +The right-sibling link required by L&Y is kept in the page "opaque +data" area, as is the left-sibling link, the page level, and some flags. +The page level counts upwards from zero at the leaf level, to the tree +depth minus 1 at the root. (Counting up from the leaves ensures that we +don't need to renumber any existing pages when splitting the root.) + +The Postgres disk block data format (an array of items) doesn't fit +Lehman and Yao's alternating-keys-and-pointers notion of a disk page, +so we have to play some games. + +On a page that is not rightmost in its tree level, the "high key" is +kept in the page's first item, and real data items start at item 2. +The link portion of the "high key" item goes unused. A page that is +rightmost has no "high key", so data items start with the first item. +Putting the high key at the left, rather than the right, may seem odd, +but it avoids moving the high key as we add data items. + +On a leaf page, the data items are simply links to (TIDs of) tuples +in the relation being indexed, with the associated key values. + +On a non-leaf page, the data items are down-links to child pages with +bounding keys. The key in each data item is the *lower* bound for +keys on that child page, so logically the key is to the left of that +downlink. The high key (if present) is the upper bound for the last +downlink. The first data item on each such page has no lower bound +--- or lower bound of minus infinity, if you prefer. 
The comparison +routines must treat it accordingly. The actual key stored in the +item is irrelevant, and need not be stored at all. This arrangement +corresponds to the fact that an L&Y non-leaf page has one more pointer +than key. + +Notes to operator class implementors +------------------------------------ + +With this implementation, we require each supported datatype to supply +us with a comparison procedure via pg_amproc. This procedure must take +two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B, +A = B, or A > B, respectively. See nbtcompare.c for examples. diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 99011a5c95..a93a9fed8c 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.96 2002/09/04 20:31:09 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -35,15 +35,6 @@ typedef struct int best_delta; /* best size delta so far */ } FindSplitData; -extern bool FixBTree; - -Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release); -static void _bt_fixtree(Relation rel, BlockNumber blkno); -static void _bt_fixbranch(Relation rel, BlockNumber lblkno, - BlockNumber rblkno, BTStack true_stack); -static void _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit); -static void _bt_fixup(Relation rel, Buffer buf); -static OffsetNumber _bt_getoff(Page page, BlockNumber blkno); static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); @@ -54,9 +45,8 @@ static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, - OffsetNumber afteritem); -static void _bt_insertuple(Relation rel, Buffer buf, - Size itemsz, BTItem btitem, 
OffsetNumber newitemoff); + OffsetNumber afteritem, + bool split_only_page); static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, BTItem newitem, bool newitemonleft, @@ -149,7 +139,8 @@ top: } /* do the insertion */ - res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, 0); + res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, + 0, false); /* be tidy */ _bt_freestack(stack); @@ -320,6 +311,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, * right using information stored in the parent stack). * + invokes itself with the appropriate tuple for the right * child page on the parent. + * + updates the metapage if a true root or fast root is split. * * On entry, we must have the right buffer on which to do the * insertion, and the buffer must be pinned and locked. On return, @@ -358,7 +350,8 @@ _bt_insertonpg(Relation rel, int keysz, ScanKey scankey, BTItem btitem, - OffsetNumber afteritem) + OffsetNumber afteritem, + bool split_only_page) { InsertIndexResult res; Page page; @@ -458,11 +451,10 @@ _bt_insertonpg(Relation rel, */ if (PageGetFreeSpace(page) < itemsz) { - Buffer rbuf; - BlockNumber bknum = BufferGetBlockNumber(buf); - BlockNumber rbknum; bool is_root = P_ISROOT(lpageop); + bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); bool newitemonleft; + Buffer rbuf; /* Choose the split point */ firstright = _bt_findsplitloc(rel, page, @@ -488,128 +480,127 @@ _bt_insertonpg(Relation rel, * locks for the child pages until we locate the parent, but we can * release them before doing the actual insertion (see Lehman and Yao * for the reasoning). - * - * Here we have to do something Lehman and Yao don't talk about: - * deal with a root split and construction of a new root. If our - * stack is empty then we have just split a node on what had been - * the root level when we descended the tree. 
If it is still the - * root then we perform a new-root construction. If it *wasn't* - * the root anymore, use the parent pointer to get up to the root - * level that someone constructed meanwhile, and find the right - * place to insert as for the normal case. *---------- */ - - if (is_root) - { - Buffer rootbuf; - - Assert(stack == (BTStack) NULL); - /* create a new root node and release the split buffers */ - rootbuf = _bt_newroot(rel, buf, rbuf); - _bt_wrtbuf(rel, rootbuf); - _bt_wrtbuf(rel, rbuf); - _bt_wrtbuf(rel, buf); - } - else - { - InsertIndexResult newres; - BTItem new_item; - BTStackData fakestack; - BTItem ritem; - Buffer pbuf; - - /* If root page was splitted */ - if (stack == (BTStack) NULL) - { - elog(LOG, "btree: concurrent ROOT page split"); - - /* - * If root page splitter failed to create new root page - * then old root' btpo_parent still points to metapage. We - * have to fix root page in this case. - */ - if (BTreeInvalidParent(lpageop)) - { - if (!FixBTree) - elog(ERROR, "bt_insertonpg[%s]: no root page found", RelationGetRelationName(rel)); - _bt_wrtbuf(rel, rbuf); - _bt_wrtnorelbuf(rel, buf); - elog(WARNING, "bt_insertonpg[%s]: root page unfound - fixing upper levels", RelationGetRelationName(rel)); - _bt_fixup(rel, buf); - goto formres; - } - - /* - * Set up a phony stack entry if we haven't got a real one - */ - stack = &fakestack; - stack->bts_blkno = lpageop->btpo_parent; - stack->bts_offset = InvalidOffsetNumber; - /* bts_btitem will be initialized below */ - stack->bts_parent = NULL; - } - - /* get high key from left page == lowest key on new right page */ - ritem = (BTItem) PageGetItem(page, - PageGetItemId(page, P_HIKEY)); - - /* form an index tuple that points at the new right page */ - new_item = _bt_formitem(&(ritem->bti_itup)); - rbknum = BufferGetBlockNumber(rbuf); - ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); - - /* - * Find the parent buffer and get the parent page. 
- * - * Oops - if we were moved right then we need to change stack - * item! We want to find parent pointing to where we are, - * right ? - vadim 05/27/97 - * - * Interestingly, this means we didn't *really* need to stack the - * parent key at all; all we really care about is the saved - * block and offset as a starting point for our search... - */ - ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), - bknum, P_HIKEY); - - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - - /* Now we can write and unlock the children */ - _bt_wrtbuf(rel, rbuf); - _bt_wrtbuf(rel, buf); - - if (pbuf == InvalidBuffer) - { - if (!FixBTree) - elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!" - "\n\tRecreate index %s.", RelationGetRelationName(rel)); - pfree(new_item); - elog(WARNING, "bt_insertonpg[%s]: parent page unfound - fixing branch", RelationGetRelationName(rel)); - _bt_fixbranch(rel, bknum, rbknum, stack); - goto formres; - } - /* Recursively update the parent */ - newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, - 0, NULL, new_item, stack->bts_offset); - - /* be tidy */ - pfree(newres); - pfree(new_item); - } + _bt_insert_parent(rel, buf, rbuf, stack, is_root, is_only); } else { + Buffer metabuf = InvalidBuffer; + Page metapg = NULL; + BTMetaPageData *metad = NULL; + itup_off = newitemoff; itup_blkno = BufferGetBlockNumber(buf); - _bt_insertuple(rel, buf, itemsz, btitem, newitemoff); + /* + * If we are doing this insert because we split a page that was + * the only one on its tree level, but was not the root, it may + * have been the "fast root". We need to ensure that the fast root + * link points at or above the current page. We can safely acquire + * a lock on the metapage here --- see comments for _bt_newroot(). 
+ */ + if (split_only_page) + { + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + if (metad->btm_fastlevel >= lpageop->btpo.level) + { + /* no update wanted */ + _bt_relbuf(rel, metabuf); + metabuf = InvalidBuffer; + } + } + + /* Do the actual update. No elog(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + _bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page"); + + if (BufferIsValid(metabuf)) + { + metad->btm_fastroot = itup_blkno; + metad->btm_fastlevel = lpageop->btpo.level; + } + + /* XLOG stuff */ + if (!rel->rd_istemp) + { + xl_btree_insert xlrec; + xl_btree_metadata xlmeta; + uint8 xlinfo; + XLogRecPtr recptr; + XLogRecData rdata[3]; + XLogRecData *nextrdata; + BTItemData truncitem; + + xlrec.target.node = rel->rd_node; + ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off); + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfBtreeInsert; + rdata[0].next = nextrdata = &(rdata[1]); + + if (BufferIsValid(metabuf)) + { + xlmeta.root = metad->btm_root; + xlmeta.level = metad->btm_level; + xlmeta.fastroot = metad->btm_fastroot; + xlmeta.fastlevel = metad->btm_fastlevel; + + nextrdata->buffer = InvalidBuffer; + nextrdata->data = (char *) &xlmeta; + nextrdata->len = sizeof(xl_btree_metadata); + nextrdata->next = nextrdata + 1; + nextrdata++; + xlinfo = XLOG_BTREE_INSERT_META; + } + else if (P_ISLEAF(lpageop)) + xlinfo = XLOG_BTREE_INSERT_LEAF; + else + xlinfo = XLOG_BTREE_INSERT_UPPER; + + /* Read comments in _bt_pgaddtup */ + if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop)) + { + truncitem = *btitem; + truncitem.bti_itup.t_info = sizeof(BTItemData); + nextrdata->data = (char *) &truncitem; + nextrdata->len = sizeof(BTItemData); + } + else + { + nextrdata->data = (char *) btitem; + nextrdata->len = IndexTupleDSize(btitem->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + } + nextrdata->buffer = 
buf; + nextrdata->next = NULL; + + recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); + + if (BufferIsValid(metabuf)) + { + PageSetLSN(metapg, recptr); + PageSetSUI(metapg, ThisStartUpID); + } + + PageSetLSN(page, recptr); + PageSetSUI(page, ThisStartUpID); + } + + END_CRIT_SECTION(); /* Write out the updated page and release pin/lock */ + if (BufferIsValid(metabuf)) + _bt_wrtbuf(rel, metabuf); + _bt_wrtbuf(rel, buf); } -formres:; /* by here, the new tuple is inserted at itup_blkno/itup_off */ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); @@ -617,61 +608,6 @@ formres:; return res; } -static void -_bt_insertuple(Relation rel, Buffer buf, - Size itemsz, BTItem btitem, OffsetNumber newitemoff) -{ - Page page = BufferGetPage(buf); - BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - START_CRIT_SECTION(); - - _bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page"); - - /* XLOG stuff */ - if (!rel->rd_istemp) - { - xl_btree_insert xlrec; - uint8 flag = XLOG_BTREE_INSERT; - XLogRecPtr recptr; - XLogRecData rdata[2]; - BTItemData truncitem; - - xlrec.target.node = rel->rd_node; - ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff); - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &xlrec; - rdata[0].len = SizeOfBtreeInsert; - rdata[0].next = &(rdata[1]); - - /* Read comments in _bt_pgaddtup */ - if (!(P_ISLEAF(pageop)) && newitemoff == P_FIRSTDATAKEY(pageop)) - { - truncitem = *btitem; - truncitem.bti_itup.t_info = sizeof(BTItemData); - rdata[1].data = (char *) &truncitem; - rdata[1].len = sizeof(BTItemData); - } - else - { - rdata[1].data = (char *) btitem; - rdata[1].len = IndexTupleDSize(btitem->bti_itup) + - (sizeof(BTItemData) - sizeof(IndexTupleData)); - } - rdata[1].buffer = buf; - rdata[1].next = NULL; - if (P_ISLEAF(pageop)) - flag |= XLOG_BTREE_LEAF; - - recptr = XLogInsert(RM_BTREE_ID, flag, rdata); - - PageSetLSN(page, recptr); - 
PageSetSUI(page, ThisStartUpID); - } - - END_CRIT_SECTION(); -} - /* * _bt_split() -- split a page in the btree. * @@ -729,13 +665,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lopaque->btpo_next = BufferGetBlockNumber(rbuf); ropaque->btpo_prev = BufferGetBlockNumber(buf); ropaque->btpo_next = oopaque->btpo_next; - - /* - * Must copy the original parent link into both new pages, even though - * it might be quite obsolete by now. We might need it if this level - * is or recently was the root (see README). - */ - lopaque->btpo_parent = ropaque->btpo_parent = oopaque->btpo_parent; + lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; /* * If the page we're splitting is not the rightmost page at its level @@ -876,34 +806,29 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, if (!rel->rd_istemp) { xl_btree_split xlrec; - int flag = (newitemonleft) ? - XLOG_BTREE_SPLEFT : XLOG_BTREE_SPLIT; - BlockNumber blkno; + uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[4]; xlrec.target.node = rel->rd_node; ItemPointerSet(&(xlrec.target.tid), *itup_blkno, *itup_off); if (newitemonleft) - { - blkno = BufferGetBlockNumber(rbuf); - BlockIdSet(&(xlrec.otherblk), blkno); - } + xlrec.otherblk = BufferGetBlockNumber(rbuf); else - { - blkno = BufferGetBlockNumber(buf); - BlockIdSet(&(xlrec.otherblk), blkno); - } - BlockIdSet(&(xlrec.parentblk), lopaque->btpo_parent); - BlockIdSet(&(xlrec.leftblk), lopaque->btpo_prev); - BlockIdSet(&(xlrec.rightblk), ropaque->btpo_next); + xlrec.otherblk = BufferGetBlockNumber(buf); + xlrec.leftblk = lopaque->btpo_prev; + xlrec.rightblk = ropaque->btpo_next; + xlrec.level = lopaque->btpo.level; /* * Direct access to page is not good but faster - we should - * implement some new func in page API. + * implement some new func in page API. Note we only store the + * tuples themselves, knowing that the item pointers are in the + * same order and can be reconstructed by scanning the tuples. 
*/ xlrec.leftlen = ((PageHeader) leftpage)->pd_special - ((PageHeader) leftpage)->pd_upper; + rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeSplit; @@ -933,10 +858,12 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, rdata[3].next = NULL; } - if (P_ISLEAF(lopaque)) - flag |= XLOG_BTREE_LEAF; + if (P_ISROOT(oopaque)) + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; + else + xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; - recptr = XLogInsert(RM_BTREE_ID, flag, rdata); + recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); PageSetLSN(leftpage, recptr); PageSetSUI(leftpage, ThisStartUpID); @@ -1175,48 +1102,179 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright, } } +/* + * _bt_insert_parent() -- Insert downlink into parent after a page split. + * + * On entry, buf and rbuf are the left and right split pages, which we + * still hold write locks on per the L&Y algorithm. We release the + * write locks once we have write lock on the parent page. (Any sooner, + * and it'd be possible for some other process to try to split or delete + * one of these pages, and get confused because it cannot find the downlink.) + * + * stack - stack showing how we got here. May be NULL in cases that don't + * have to be efficient (concurrent ROOT split, WAL recovery) + * is_root - we split the true root + * is_only - we split a page alone on its level (might have been fast root) + * + * This is exported so it can be called by nbtxlog.c. + */ +void +_bt_insert_parent(Relation rel, + Buffer buf, + Buffer rbuf, + BTStack stack, + bool is_root, + bool is_only) +{ + /* + * Here we have to do something Lehman and Yao don't talk about: + * deal with a root split and construction of a new root. If our + * stack is empty then we have just split a node on what had been + * the root level when we descended the tree. 
If it was still the + * root then we perform a new-root construction. If it *wasn't* + * the root anymore, search to find the next higher level that + * someone constructed meanwhile, and find the right place to insert + * as for the normal case. + * + * If we have to search for the parent level, we do so by + * re-descending from the root. This is not super-efficient, + * but it's rare enough not to matter. (This path is also taken + * when called from WAL recovery --- we have no stack in that case.) + */ + if (is_root) + { + Buffer rootbuf; + + Assert(stack == (BTStack) NULL); + Assert(is_only); + /* create a new root node and update the metapage */ + rootbuf = _bt_newroot(rel, buf, rbuf); + /* release the split buffers */ + _bt_wrtbuf(rel, rootbuf); + _bt_wrtbuf(rel, rbuf); + _bt_wrtbuf(rel, buf); + } + else + { + BlockNumber bknum = BufferGetBlockNumber(buf); + BlockNumber rbknum = BufferGetBlockNumber(rbuf); + Page page = BufferGetPage(buf); + InsertIndexResult newres; + BTItem new_item; + BTStackData fakestack; + BTItem ritem; + Buffer pbuf; + + if (stack == (BTStack) NULL) + { + BTPageOpaque lpageop; + + if (!InRecovery) + elog(DEBUG1, "_bt_insert_parent: concurrent ROOT page split"); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + /* Find the leftmost page at the next level up */ + pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false); + /* Set up a phony stack entry pointing there */ + stack = &fakestack; + stack->bts_blkno = BufferGetBlockNumber(pbuf); + stack->bts_offset = InvalidOffsetNumber; + /* bts_btitem will be initialized below */ + stack->bts_parent = NULL; + _bt_relbuf(rel, pbuf); + } + + /* get high key from left page == lowest key on new right page */ + ritem = (BTItem) PageGetItem(page, + PageGetItemId(page, P_HIKEY)); + + /* form an index tuple that points at the new right page */ + new_item = _bt_formitem(&(ritem->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); + + /* + * Find the parent buffer 
and get the parent page. + * + * Oops - if we were moved right then we need to change stack + * item! We want to find parent pointing to where we are, + * right ? - vadim 05/27/97 + */ + ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), + bknum, P_HIKEY); + + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + + /* Now we can write and unlock the children */ + _bt_wrtbuf(rel, rbuf); + _bt_wrtbuf(rel, buf); + + /* Check for error only after writing children */ + if (pbuf == InvalidBuffer) + elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!" + "\n\tRecreate index %s.", RelationGetRelationName(rel)); + + /* Recursively update the parent */ + newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, + 0, NULL, new_item, stack->bts_offset, + is_only); + + /* be tidy */ + pfree(newres); + pfree(new_item); + } +} + /* * _bt_getstackbuf() -- Walk back up the tree one step, and find the item * we last looked at in the parent. * - * This is possible because we save a bit image of the last item - * we looked at in the parent, and the update algorithm guarantees - * that if items above us in the tree move, they only move right. + * This is possible because we save the downlink from the parent item, + * which is enough to uniquely identify it. Insertions into the parent + * level could cause the item to move right; deletions could cause it + * to move left, but not left of the page we previously found it in. * - * Also, re-set bts_blkno & bts_offset if changed. + * Adjusts bts_blkno & bts_offset if changed. + * + * Returns InvalidBuffer if item not found (should not happen). 
*/ static Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access) { BlockNumber blkno; - Buffer buf; - OffsetNumber start, - offnum, - maxoff; - Page page; - ItemId itemid; - BTItem item; - BTPageOpaque opaque; + OffsetNumber start; blkno = stack->bts_blkno; - buf = _bt_getbuf(rel, blkno, access); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - start = stack->bts_offset; - /* - * _bt_insertonpg set bts_offset to InvalidOffsetNumber in the case of - * concurrent ROOT page split. Also, watch out for possibility that - * page has a high key now when it didn't before. - */ - if (start < P_FIRSTDATAKEY(opaque)) - start = P_FIRSTDATAKEY(opaque); - for (;;) { - /* see if it's on this page */ + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum, + minoff, + maxoff; + ItemId itemid; + BTItem item; + + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * start = InvalidOffsetNumber means "search the whole page". + * We need this test anyway due to possibility that + * page has a high key now when it didn't before. + */ + if (start < minoff) + start = minoff; + + /* + * These loops will check every item on the page --- but in an order + * that's attuned to the probability of where it actually is. Scan + * to the right first, then to the left. 
+ */ for (offnum = start; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) @@ -1232,23 +1290,32 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) } } + for (offnum = OffsetNumberPrev(start); + offnum >= minoff; + offnum = OffsetNumberPrev(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (BTItemSame(item, &stack->bts_btitem)) + { + /* Return accurate pointer to where link is now */ + stack->bts_blkno = blkno; + stack->bts_offset = offnum; + return buf; + } + } + /* - * by here, the item we're looking for moved right at least one - * page + * The item we're looking for moved right at least one page. */ if (P_RIGHTMOST(opaque)) { _bt_relbuf(rel, buf); return (InvalidBuffer); } - blkno = opaque->btpo_next; + start = InvalidOffsetNumber; _bt_relbuf(rel, buf); - buf = _bt_getbuf(rel, blkno, access); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - start = P_FIRSTDATAKEY(opaque); } } @@ -1289,6 +1356,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) Page metapg; BTMetaPageData *metad; + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + rpage = BufferGetPage(rbuf); + /* get a new root page */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); rootpage = BufferGetPage(rootbuf); @@ -1303,22 +1375,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) /* set btree special data */ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; - rootopaque->btpo_flags |= BTP_ROOT; - rootopaque->btpo_parent = BTREE_METAPAGE; + rootopaque->btpo_flags = BTP_ROOT; + rootopaque->btpo.level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; - lbkno = BufferGetBlockNumber(lbuf); - rbkno = BufferGetBlockNumber(rbuf); - lpage = BufferGetPage(lbuf); - rpage = BufferGetPage(rbuf); - - /* - * Make sure pages in old 
root level have valid parent links --- we - * will need this in _bt_insertonpg() if a concurrent root split - * happens (see README). - */ - ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_parent = - ((BTPageOpaque) PageGetSpecialPointer(rpage))->btpo_parent = - rootblknum; + /* update metapage data */ + metad->btm_root = rootblknum; + metad->btm_level = rootopaque->btpo.level; + metad->btm_fastroot = rootblknum; + metad->btm_fastlevel = rootopaque->btpo.level; /* * Create downlink item for left page (old root). Since this will be @@ -1356,9 +1421,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) elog(PANIC, "btree: failed to add rightkey to new root page"); pfree(new_item); - metad->btm_root = rootblknum; - (metad->btm_level)++; - /* XLOG stuff */ if (!rel->rd_istemp) { @@ -1367,8 +1429,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) XLogRecData rdata[2]; xlrec.node = rel->rd_node; + xlrec.rootblk = rootblknum; xlrec.level = metad->btm_level; - BlockIdSet(&(xlrec.rootblk), rootblknum); + rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeNewroot; @@ -1390,8 +1453,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) PageSetSUI(rootpage, ThisStartUpID); PageSetLSN(metapg, recptr); PageSetSUI(metapg, ThisStartUpID); - - /* we changed their btpo_parent */ PageSetLSN(lpage, recptr); PageSetSUI(lpage, ThisStartUpID); PageSetLSN(rpage, recptr); @@ -1406,620 +1467,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) return (rootbuf); } -/* - * In the event old root page was splitted but no new one was created we - * build required parent levels keeping write lock on old root page. - * Note: it's assumed that old root page' btpo_parent points to meta page, - * ie not to parent page. On exit, new root page buffer is write locked. - * If "release" is TRUE then oldrootbuf will be released immediately - * after upper level is builded. 
- */ -Buffer -_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release) -{ - Buffer rootbuf; - BlockNumber rootblk; - Page rootpage; - XLogRecPtr rootLSN; - Page oldrootpage = BufferGetPage(oldrootbuf); - BTPageOpaque oldrootopaque = (BTPageOpaque) - PageGetSpecialPointer(oldrootpage); - Buffer buf, - leftbuf, - rightbuf; - Page page, - leftpage, - rightpage; - BTPageOpaque opaque, - leftopaque, - rightopaque; - OffsetNumber newitemoff; - BTItem btitem, - ritem; - Size itemsz; - - if (!P_LEFTMOST(oldrootopaque) || P_RIGHTMOST(oldrootopaque)) - elog(ERROR, "bt_fixroot: not valid old root page"); - - /* Read right neighbor and create new root page */ - leftbuf = _bt_getbuf(rel, oldrootopaque->btpo_next, BT_WRITE); - leftpage = BufferGetPage(leftbuf); - leftopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); - rootbuf = _bt_newroot(rel, oldrootbuf, leftbuf); - rootpage = BufferGetPage(rootbuf); - rootLSN = PageGetLSN(rootpage); - rootblk = BufferGetBlockNumber(rootbuf); - - /* parent page where to insert pointers */ - buf = rootbuf; - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * Now read other pages (if any) on level and add them to new root. - * Here we break one of our locking rules - never hold lock on parent - * page when acquiring lock on its child, - but we free from deadlock: - * - * If concurrent process will split one of pages on this level then it - * will see either btpo_parent == metablock or btpo_parent == rootblk. - * In first case it will give up its locks and walk to the leftmost - * page (oldrootbuf) in _bt_fixup() - ie it will wait for us and let - * us continue. In second case it will try to lock rootbuf keeping its - * locks on buffers we already passed, also waiting for us. If we'll - * have to unlock rootbuf (split it) and that process will have to - * split page of new level we created (level of rootbuf) then it will - * wait while we create upper level. Etc. 
- */ - while (!P_RIGHTMOST(leftopaque)) - { - rightbuf = _bt_getbuf(rel, leftopaque->btpo_next, BT_WRITE); - rightpage = BufferGetPage(rightbuf); - rightopaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); - - /* - * Update LSN & StartUpID of child page buffer to ensure that it - * will be written on disk after flushing log record for new root - * creation. Unfortunately, for the moment (?) we do not log this - * operation and so possibly break our rule to log entire page - * content on first after checkpoint modification. - */ - HOLD_INTERRUPTS(); - rightopaque->btpo_parent = rootblk; - if (XLByteLT(PageGetLSN(rightpage), rootLSN)) - PageSetLSN(rightpage, rootLSN); - PageSetSUI(rightpage, ThisStartUpID); - RESUME_INTERRUPTS(); - - ritem = (BTItem) PageGetItem(leftpage, PageGetItemId(leftpage, P_HIKEY)); - btitem = _bt_formitem(&(ritem->bti_itup)); - ItemPointerSet(&(btitem->bti_itup.t_tid), leftopaque->btpo_next, P_HIKEY); - itemsz = IndexTupleDSize(btitem->bti_itup) - + (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = MAXALIGN(itemsz); - - newitemoff = OffsetNumberNext(PageGetMaxOffsetNumber(page)); - - if (PageGetFreeSpace(page) < itemsz) - { - Buffer newbuf; - OffsetNumber firstright; - OffsetNumber itup_off; - BlockNumber itup_blkno; - bool newitemonleft; - - firstright = _bt_findsplitloc(rel, page, - newitemoff, itemsz, &newitemonleft); - newbuf = _bt_split(rel, buf, firstright, - newitemoff, itemsz, btitem, newitemonleft, - &itup_off, &itup_blkno); - /* Keep lock on new "root" buffer ! 
*/ - if (buf != rootbuf) - _bt_relbuf(rel, buf); - buf = newbuf; - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - } - else - _bt_insertuple(rel, buf, itemsz, btitem, newitemoff); - - /* give up left buffer */ - _bt_wrtbuf(rel, leftbuf); - pfree(btitem); - leftbuf = rightbuf; - leftpage = rightpage; - leftopaque = rightopaque; - } - - /* give up rightmost page buffer */ - _bt_wrtbuf(rel, leftbuf); - - /* - * Here we hold locks on old root buffer, new root buffer we've - * created with _bt_newroot() - rootbuf, - and buf we've used for last - * insert ops - buf. If rootbuf != buf then we have to create at least - * one more level. And if "release" is TRUE then we give up - * oldrootbuf. - */ - if (release) - _bt_wrtbuf(rel, oldrootbuf); - - if (rootbuf != buf) - { - _bt_wrtbuf(rel, buf); - return (_bt_fixroot(rel, rootbuf, true)); - } - - return (rootbuf); -} - -/* - * Using blkno of leftmost page on a level inside tree this func - * checks/fixes tree from this level up to the root page. - */ -static void -_bt_fixtree(Relation rel, BlockNumber blkno) -{ - Buffer buf; - Page page; - BTPageOpaque opaque; - BlockNumber pblkno; - - for (;;) - { - buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (!P_LEFTMOST(opaque) || P_ISLEAF(opaque)) - elog(ERROR, "bt_fixtree[%s]: invalid start page (need to recreate index)", RelationGetRelationName(rel)); - pblkno = opaque->btpo_parent; - - /* check/fix entire level */ - _bt_fixlevel(rel, buf, InvalidBlockNumber); - - /* - * No pins/locks are held here. Re-read start page if its - * btpo_parent pointed to meta page else go up one level. 
- * - * XXX have to catch InvalidBlockNumber at the moment -:( - */ - if (pblkno == BTREE_METAPAGE || pblkno == InvalidBlockNumber) - { - buf = _bt_getbuf(rel, blkno, BT_WRITE); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISROOT(opaque)) - { - /* Tree is Ok now */ - _bt_relbuf(rel, buf); - return; - } - /* Call _bt_fixroot() if there is no upper level */ - if (BTreeInvalidParent(opaque)) - { - elog(WARNING, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel)); - buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf); - return; - } - /* Have to go up one level */ - pblkno = opaque->btpo_parent; - _bt_relbuf(rel, buf); - } - blkno = pblkno; - } - -} - -/* - * Check/fix level starting from page in buffer buf up to block - * limit on *child* level (or till rightmost child page if limit - * is InvalidBlockNumber). Start buffer must be read locked. - * No pins/locks are held on exit. - */ -static void -_bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) -{ - BlockNumber blkno = BufferGetBlockNumber(buf); - Page page; - BTPageOpaque opaque; - BlockNumber cblkno[3]; - OffsetNumber coff[3]; - Buffer cbuf[3]; - Page cpage[3]; - BTPageOpaque copaque[3]; - BTItem btitem; - int cidx, - i; - bool goodbye = false; - char tbuf[BLCKSZ]; - - page = BufferGetPage(buf); - /* copy page to temp storage */ - memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf); - - page = (Page) tbuf; - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* Initialize first child data */ - coff[0] = P_FIRSTDATAKEY(opaque); - if (coff[0] > PageGetMaxOffsetNumber(page)) - elog(ERROR, "bt_fixlevel[%s]: invalid maxoff on start page (need to recreate index)", RelationGetRelationName(rel)); - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, coff[0])); - cblkno[0] = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)); - cbuf[0] = _bt_getbuf(rel, cblkno[0], BT_READ); - cpage[0] = BufferGetPage(cbuf[0]); - copaque[0] 
= (BTPageOpaque) PageGetSpecialPointer(cpage[0]); - if (P_LEFTMOST(opaque) && !P_LEFTMOST(copaque[0])) - elog(ERROR, "bt_fixtlevel[%s]: non-leftmost child page of leftmost parent (need to recreate index)", RelationGetRelationName(rel)); - /* caller should take care and avoid this */ - if (P_RIGHTMOST(copaque[0])) - elog(ERROR, "bt_fixtlevel[%s]: invalid start child (need to recreate index)", RelationGetRelationName(rel)); - - for (;;) - { - /* - * Read up to 2 more child pages and look for pointers to them in - * *saved* parent page - */ - coff[1] = coff[2] = InvalidOffsetNumber; - for (cidx = 0; cidx < 2;) - { - cidx++; - cblkno[cidx] = (copaque[cidx - 1])->btpo_next; - cbuf[cidx] = _bt_getbuf(rel, cblkno[cidx], BT_READ); - cpage[cidx] = BufferGetPage(cbuf[cidx]); - copaque[cidx] = (BTPageOpaque) PageGetSpecialPointer(cpage[cidx]); - coff[cidx] = _bt_getoff(page, cblkno[cidx]); - - /* sanity check */ - if (coff[cidx] != InvalidOffsetNumber) - { - for (i = cidx - 1; i >= 0; i--) - { - if (coff[i] == InvalidOffsetNumber) - continue; - if (coff[cidx] != coff[i] + 1) - elog(ERROR, "bt_fixlevel[%s]: invalid item order(1) (need to recreate index)", RelationGetRelationName(rel)); - break; - } - } - - if (P_RIGHTMOST(copaque[cidx])) - break; - } - - /* - * Read parent page and insert missed pointers. 
- */ - if (coff[1] == InvalidOffsetNumber || - (cidx == 2 && coff[2] == InvalidOffsetNumber)) - { - Buffer newbuf; - Page newpage; - BTPageOpaque newopaque; - BTItem ritem; - Size itemsz; - OffsetNumber newitemoff; - BlockNumber parblk[3]; - BTStackData stack; - - stack.bts_parent = NULL; - stack.bts_blkno = blkno; - stack.bts_offset = InvalidOffsetNumber; - ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), - cblkno[0], P_HIKEY); - - buf = _bt_getstackbuf(rel, &stack, BT_WRITE); - if (buf == InvalidBuffer) - elog(ERROR, "bt_fixlevel[%s]: pointer disappeared (need to recreate index)", RelationGetRelationName(rel)); - - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - coff[0] = stack.bts_offset; - blkno = BufferGetBlockNumber(buf); - parblk[0] = blkno; - - /* Check/insert missed pointers */ - for (i = 1; i <= cidx; i++) - { - coff[i] = _bt_getoff(page, cblkno[i]); - - /* sanity check */ - parblk[i] = BufferGetBlockNumber(buf); - if (coff[i] != InvalidOffsetNumber) - { - if (parblk[i] == parblk[i - 1] && - coff[i] != coff[i - 1] + 1) - elog(ERROR, "bt_fixlevel[%s]: invalid item order(2) (need to recreate index)", RelationGetRelationName(rel)); - continue; - } - /* Have to check next page ? */ - if ((!P_RIGHTMOST(opaque)) && - coff[i - 1] == PageGetMaxOffsetNumber(page)) /* yes */ - { - newbuf = _bt_getbuf(rel, opaque->btpo_next, BT_WRITE); - newpage = BufferGetPage(newbuf); - newopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); - coff[i] = _bt_getoff(newpage, cblkno[i]); - if (coff[i] != InvalidOffsetNumber) /* found ! 
*/ - { - if (coff[i] != P_FIRSTDATAKEY(newopaque)) - elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf); - buf = newbuf; - page = newpage; - opaque = newopaque; - blkno = BufferGetBlockNumber(buf); - parblk[i] = blkno; - continue; - } - /* unfound - need to insert on current page */ - _bt_relbuf(rel, newbuf); - } - /* insert pointer */ - ritem = (BTItem) PageGetItem(cpage[i - 1], - PageGetItemId(cpage[i - 1], P_HIKEY)); - btitem = _bt_formitem(&(ritem->bti_itup)); - ItemPointerSet(&(btitem->bti_itup.t_tid), cblkno[i], P_HIKEY); - itemsz = IndexTupleDSize(btitem->bti_itup) - + (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = MAXALIGN(itemsz); - - newitemoff = coff[i - 1] + 1; - - if (PageGetFreeSpace(page) < itemsz) - { - OffsetNumber firstright; - OffsetNumber itup_off; - BlockNumber itup_blkno; - bool newitemonleft; - - firstright = _bt_findsplitloc(rel, page, - newitemoff, itemsz, &newitemonleft); - newbuf = _bt_split(rel, buf, firstright, - newitemoff, itemsz, btitem, newitemonleft, - &itup_off, &itup_blkno); - /* what buffer we need in ? */ - if (newitemonleft) - _bt_relbuf(rel, newbuf); - else - { - _bt_relbuf(rel, buf); - buf = newbuf; - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - } - blkno = BufferGetBlockNumber(buf); - coff[i] = itup_off; - } - else - { - _bt_insertuple(rel, buf, itemsz, btitem, newitemoff); - coff[i] = newitemoff; - } - - pfree(btitem); - parblk[i] = blkno; - } - - /* copy page with pointer to cblkno[cidx] to temp storage */ - memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf); - page = (Page) tbuf; - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - } - - /* Continue if current check/fix level page is rightmost */ - if (P_RIGHTMOST(opaque)) - goodbye = false; - - /* Pointers to child pages are Ok - right end of child level ? 
*/ - _bt_relbuf(rel, cbuf[0]); - _bt_relbuf(rel, cbuf[1]); - if (cidx == 1 || - (cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye))) - { - if (cidx == 2) - _bt_relbuf(rel, cbuf[2]); - return; - } - if (cblkno[0] == limit || cblkno[1] == limit) - goodbye = true; - cblkno[0] = cblkno[2]; - cbuf[0] = cbuf[2]; - cpage[0] = cpage[2]; - copaque[0] = copaque[2]; - coff[0] = coff[2]; - } -} - -/* - * Check/fix part of tree - branch - up from parent of level with blocks - * lblkno and rblknum. We first ensure that parent level has pointers - * to both lblkno & rblknum and if those pointers are on different - * parent pages then do the same for parent level, etc. No locks must - * be held on target level and upper on entry. No locks will be held - * on exit. Stack created when traversing tree down should be provided and - * it must points to parent level. rblkno must be on the right from lblkno. - * (This function is special edition of more expensive _bt_fixtree(), - * but it doesn't guarantee full consistency of tree.) - */ -static void -_bt_fixbranch(Relation rel, BlockNumber lblkno, - BlockNumber rblkno, BTStack true_stack) -{ - BlockNumber blkno = true_stack->bts_blkno; - BTStackData stack; - BTPageOpaque opaque; - Buffer buf, - rbuf; - Page page; - OffsetNumber offnum; - - true_stack = true_stack->bts_parent; - for (;;) - { - buf = _bt_getbuf(rel, blkno, BT_READ); - - /* Check/fix parent level pointed by blkno */ - _bt_fixlevel(rel, buf, rblkno); - - /* - * Here parent level should have pointers for both lblkno and - * rblkno and we have to find them. 
- */ - stack.bts_parent = NULL; - stack.bts_blkno = blkno; - stack.bts_offset = InvalidOffsetNumber; - ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), lblkno, P_HIKEY); - buf = _bt_getstackbuf(rel, &stack, BT_READ); - if (buf == InvalidBuffer) - elog(ERROR, "bt_fixbranch[%s]: left pointer unfound (need to recreate index)", RelationGetRelationName(rel)); - page = BufferGetPage(buf); - offnum = _bt_getoff(page, rblkno); - - if (offnum != InvalidOffsetNumber) /* right pointer found */ - { - if (offnum <= stack.bts_offset) - elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf); - return; - } - - /* Pointers are on different parent pages - find right one */ - lblkno = BufferGetBlockNumber(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_RIGHTMOST(opaque)) - elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(1) (need to recreate index)", RelationGetRelationName(rel)); - - stack.bts_parent = NULL; - stack.bts_blkno = opaque->btpo_next; - stack.bts_offset = InvalidOffsetNumber; - ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), rblkno, P_HIKEY); - rbuf = _bt_getstackbuf(rel, &stack, BT_READ); - if (rbuf == InvalidBuffer) - elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel)); - rblkno = BufferGetBlockNumber(rbuf); - _bt_relbuf(rel, rbuf); - - /* - * If we have parent item in true_stack then go up one level and - * ensure that it has pointers to new lblkno & rblkno. - */ - if (true_stack) - { - _bt_relbuf(rel, buf); - blkno = true_stack->bts_blkno; - true_stack = true_stack->bts_parent; - continue; - } - - /* - * Well, we are on the level that was root or unexistent when we - * started traversing tree down. If btpo_parent is updated then - * we'll use it to continue, else we'll fix/restore upper levels - * entirely. 
- */ - if (!BTreeInvalidParent(opaque)) - { - blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf); - continue; - } - - /* Have to switch to excl buf lock and re-check btpo_parent */ - _bt_relbuf(rel, buf); - buf = _bt_getbuf(rel, blkno, BT_WRITE); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (!BTreeInvalidParent(opaque)) - { - blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf); - continue; - } - - /* - * We hold excl lock on some internal page with unupdated - * btpo_parent - time for _bt_fixup. - */ - break; - } - - elog(WARNING, "bt_fixbranch[%s]: fixing upper levels", RelationGetRelationName(rel)); - _bt_fixup(rel, buf); - - return; -} - -/* - * Having buf excl locked this routine walks to the left on level and - * uses either _bt_fixtree() or _bt_fixroot() to create/check&fix upper - * levels. No buffer pins/locks will be held on exit. - */ -static void -_bt_fixup(Relation rel, Buffer buf) -{ - Page page; - BTPageOpaque opaque; - BlockNumber blkno; - - for (;;) - { - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * If someone else already created parent pages then it's time for - * _bt_fixtree() to check upper levels and fix them, if required. - */ - if (!BTreeInvalidParent(opaque)) - { - blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf); - elog(WARNING, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel)); - _bt_fixtree(rel, blkno); - return; - } - if (P_LEFTMOST(opaque)) - break; - blkno = opaque->btpo_prev; - _bt_relbuf(rel, buf); - buf = _bt_getbuf(rel, blkno, BT_WRITE); - } - - /* - * Ok, we are on the leftmost page, it's write locked by us and its - * btpo_parent points to meta page - time for _bt_fixroot(). 
- */ - elog(WARNING, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel)); - buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf); -} - -static OffsetNumber -_bt_getoff(Page page, BlockNumber blkno) -{ - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); - OffsetNumber maxoff = PageGetMaxOffsetNumber(page); - OffsetNumber offnum = P_FIRSTDATAKEY(opaque); - BlockNumber curblkno; - ItemId itemid; - BTItem item; - - for (; offnum <= maxoff; offnum++) - { - itemid = PageGetItemId(page, offnum); - item = (BTItem) PageGetItem(page, itemid); - curblkno = ItemPointerGetBlockNumber(&(item->bti_itup.t_tid)); - if (curblkno == blkno) - return (offnum); - } - - return (InvalidOffsetNumber); -} - /* * _bt_pgaddtup() -- add a tuple to a particular page in the index. * diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 110de69406..c9879b73ae 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.58 2002/08/06 02:36:33 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -47,15 +47,16 @@ extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release); #define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) + /* - * _bt_metapinit() -- Initialize the metadata page of a btree. + * _bt_metapinit() -- Initialize the metadata page of a new btree. */ void _bt_metapinit(Relation rel) { Buffer buf; Page pg; - BTMetaPageData metad; + BTMetaPageData *metad; BTPageOpaque op; /* can't be sharing this with anyone, now... 
*/ @@ -67,18 +68,51 @@ _bt_metapinit(Relation rel) RelationGetRelationName(rel)); buf = ReadBuffer(rel, P_NEW); + Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE); pg = BufferGetPage(buf); + + /* NO ELOG(ERROR) from here till newmeta op is logged */ + START_CRIT_SECTION(); + _bt_pageinit(pg, BufferGetPageSize(buf)); - metad.btm_magic = BTREE_MAGIC; - metad.btm_version = BTREE_VERSION; - metad.btm_root = P_NONE; - metad.btm_level = 0; - memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + metad = BTPageGetMeta(pg); + metad->btm_magic = BTREE_MAGIC; + metad->btm_version = BTREE_VERSION; + metad->btm_root = P_NONE; + metad->btm_level = 0; + metad->btm_fastroot = P_NONE; + metad->btm_fastlevel = 0; op = (BTPageOpaque) PageGetSpecialPointer(pg); op->btpo_flags = BTP_META; + /* XLOG stuff */ + if (!rel->rd_istemp) + { + xl_btree_newmeta xlrec; + XLogRecPtr recptr; + XLogRecData rdata[1]; + + xlrec.node = rel->rd_node; + xlrec.meta.root = metad->btm_root; + xlrec.meta.level = metad->btm_level; + xlrec.meta.fastroot = metad->btm_fastroot; + xlrec.meta.fastlevel = metad->btm_fastlevel; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfBtreeNewmeta; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata); + + PageSetLSN(pg, recptr); + PageSetSUI(pg, ThisStartUpID); + } + + END_CRIT_SECTION(); + WriteBuffer(buf); /* all done */ @@ -102,6 +136,14 @@ _bt_metapinit(Relation rel) * NOTE that the returned root page will have only a read lock set * on it even if access = BT_WRITE! * + * The returned page is not necessarily the true root --- it could be + * a "fast root" (a page that is alone in its level due to deletions). + * Also, if the root page is split while we are "in flight" to it, + * what we will return is the old root, which is now just the leftmost + * page on a probably-not-very-wide level. 
For most purposes this is + * as good as or better than the true root, so we do not bother to + * insist on finding the true root. + * * On successful return, the root page is pinned and read-locked. * The metadata page is not locked or pinned on exit. */ @@ -162,15 +204,19 @@ _bt_getroot(Relation rel, int access) rootblkno = BufferGetBlockNumber(rootbuf); rootpage = BufferGetPage(rootbuf); + _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); + rootopaque->btpo.level = 0; + /* NO ELOG(ERROR) till meta is updated */ START_CRIT_SECTION(); metad->btm_root = rootblkno; - metad->btm_level = 1; - - _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); - rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + metad->btm_level = 0; + metad->btm_fastroot = rootblkno; + metad->btm_fastlevel = 0; /* XLOG stuff */ if (!rel->rd_istemp) @@ -180,16 +226,15 @@ _bt_getroot(Relation rel, int access) XLogRecData rdata; xlrec.node = rel->rd_node; - xlrec.level = 1; - BlockIdSet(&(xlrec.rootblk), rootblkno); + xlrec.rootblk = rootblkno; + xlrec.level = 0; + rdata.buffer = InvalidBuffer; rdata.data = (char *) &xlrec; rdata.len = SizeOfBtreeNewroot; rdata.next = NULL; - recptr = XLogInsert(RM_BTREE_ID, - XLOG_BTREE_NEWROOT | XLOG_BTREE_LEAF, - &rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata); PageSetLSN(rootpage, recptr); PageSetSUI(rootpage, ThisStartUpID); @@ -201,7 +246,11 @@ _bt_getroot(Relation rel, int access) _bt_wrtnorelbuf(rel, rootbuf); - /* swap write lock for read lock */ + /* + * swap root write lock for read lock. There is no danger of + * anyone else accessing the new root page while it's unlocked, + * since no one else knows where it is yet. 
+ */ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); @@ -221,86 +270,72 @@ _bt_getroot(Relation rel, int access) } else { - rootblkno = metad->btm_root; + rootblkno = metad->btm_fastroot; + _bt_relbuf(rel, metabuf); /* done with the meta page */ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); } /* - * Race condition: If the root page split between the time we looked - * at the metadata page and got the root buffer, then we got the wrong - * buffer. Release it and try again. + * By here, we have a pin and read lock on the root page, and no + * lock set on the metadata page. Return the root page's buffer. */ - rootpage = BufferGetPage(rootbuf); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + return rootbuf; +} - if (!P_ISROOT(rootopaque)) +/* + * _bt_gettrueroot() -- Get the true root page of the btree. + * + * This is the same as the BT_READ case of _bt_getroot(), except + * we follow the true-root link not the fast-root link. + * + * By the time we acquire lock on the root page, it might have been split and + * not be the true root anymore. This is okay for the present uses of this + * routine; we only really need to be able to move up at least one tree level + * from whatever non-root page we were at. If we ever do need to lock the + * one true root page, we could loop here, re-reading the metapage on each + * failure. (Note that it wouldn't do to hold the lock on the metapage while + * moving to the root --- that'd deadlock against any concurrent root split.) 
+ */ +Buffer +_bt_gettrueroot(Relation rel) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (!(metaopaque->btpo_flags & BTP_META) || + metad->btm_magic != BTREE_MAGIC) + elog(ERROR, "Index %s is not a btree", + RelationGetRelationName(rel)); + + if (metad->btm_version != BTREE_VERSION) + elog(ERROR, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + + /* if no root page initialized yet, fail */ + if (metad->btm_root == P_NONE) { - /* - * It happened, but if root page splitter failed to create new - * root page then we'll go in loop trying to call _bt_getroot - * again and again. - */ - if (FixBTree) - { - Buffer newrootbuf; - - check_parent:; - if (BTreeInvalidParent(rootopaque)) /* unupdated! */ - { - LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(rootbuf, BT_WRITE); - - /* handle concurrent fix of root page */ - if (BTreeInvalidParent(rootopaque)) /* unupdated! 
*/ - { - elog(WARNING, "bt_getroot[%s]: fixing root page", RelationGetRelationName(rel)); - newrootbuf = _bt_fixroot(rel, rootbuf, true); - LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(newrootbuf, BT_READ); - rootbuf = newrootbuf; - rootpage = BufferGetPage(rootbuf); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); - /* New root might be splitted while changing lock */ - if (P_ISROOT(rootopaque)) - return (rootbuf); - /* rootbuf is read locked */ - goto check_parent; - } - else - { - /* someone else already fixed root */ - LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(rootbuf, BT_READ); - } - } - - /* - * Ok, here we have old root page with btpo_parent pointing to - * upper level - check parent page because of there is good - * chance that parent is root page. - */ - newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ); - _bt_relbuf(rel, rootbuf); - rootbuf = newrootbuf; - rootpage = BufferGetPage(rootbuf); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); - if (P_ISROOT(rootopaque)) - return (rootbuf); - /* no luck -:( */ - } - - /* try again */ - _bt_relbuf(rel, rootbuf); - return _bt_getroot(rel, access); + _bt_relbuf(rel, metabuf); + return InvalidBuffer; } - /* - * By here, we have a correct lock on the root block, its reference - * count is correct, and we have no lock set on the metadata page. - * Return the root block. - */ + rootblkno = metad->btm_root; + + _bt_relbuf(rel, metabuf); /* done with the meta page */ + + rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); + return rootbuf; } @@ -397,13 +432,14 @@ _bt_wrtnorelbuf(Relation rel, Buffer buf) /* * _bt_pageinit() -- Initialize a new page. + * + * On return, the page header is initialized; data space is empty; + * special space is zeroed out. 
*/ void _bt_pageinit(Page page, Size size) { PageInit(page, size, sizeof(BTPageOpaqueData)); - ((BTPageOpaque) PageGetSpecialPointer(page))->btpo_parent = - InvalidBlockNumber; } /* @@ -418,9 +454,12 @@ _bt_pageinit(Page page, Size size) * at least the old root page when you call this, you're making a big * mistake. On exit, metapage data is correct and we no longer have * a pin or lock on the metapage. + * + * XXX this is not used for splitting anymore, only in nbtsort.c at the + * completion of btree building. */ void -_bt_metaproot(Relation rel, BlockNumber rootbknum, int level) +_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level) { Buffer metabuf; Page metap; @@ -431,12 +470,42 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level) metap = BufferGetPage(metabuf); metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); Assert(metaopaque->btpo_flags & BTP_META); + + /* NO ELOG(ERROR) from here till newmeta op is logged */ + START_CRIT_SECTION(); + metad = BTPageGetMeta(metap); metad->btm_root = rootbknum; - if (level == 0) /* called from _do_insert */ - metad->btm_level += 1; - else - metad->btm_level = level; /* called from btsort */ + metad->btm_level = level; + metad->btm_fastroot = rootbknum; + metad->btm_fastlevel = level; + + /* XLOG stuff */ + if (!rel->rd_istemp) + { + xl_btree_newmeta xlrec; + XLogRecPtr recptr; + XLogRecData rdata[1]; + + xlrec.node = rel->rd_node; + xlrec.meta.root = metad->btm_root; + xlrec.meta.level = metad->btm_level; + xlrec.meta.fastroot = metad->btm_fastroot; + xlrec.meta.fastlevel = metad->btm_fastlevel; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfBtreeNewmeta; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata); + + PageSetLSN(metap, recptr); + PageSetSUI(metap, ThisStartUpID); + } + + END_CRIT_SECTION(); + _bt_wrtbuf(rel, metabuf); } @@ -467,6 +536,7 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) 
xlrec.target.node = rel->rd_node; xlrec.target.tid = *tid; + rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDelete; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 3244beb121..de6765415f 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,21 +12,17 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.94 2002/11/15 01:26:08 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $ * *------------------------------------------------------------------------- */ - #include "postgres.h" #include "access/genam.h" #include "access/heapam.h" #include "access/nbtree.h" #include "catalog/index.h" -#include "executor/executor.h" #include "miscadmin.h" -#include "storage/sinval.h" -#include "access/xlogutils.h" /* Working state for btbuild and its callback */ @@ -817,396 +813,3 @@ _bt_restscan(IndexScanDesc scan) ItemPointerSet(current, blkno, offnum); } } - -static void -_bt_restore_page(Page page, char *from, int len) -{ - BTItemData btdata; - Size itemsz; - char *end = from + len; - - for (; from < end;) - { - memcpy(&btdata, from, sizeof(BTItemData)); - itemsz = IndexTupleDSize(btdata.bti_itup) + - (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = MAXALIGN(itemsz); - if (PageAddItem(page, (Item) from, itemsz, - FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) - elog(PANIC, "_bt_restore_page: can't add item to page"); - from += itemsz; - } -} - -static void -btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) -{ - xl_btree_delete *xlrec; - Relation reln; - Buffer buffer; - Page page; - - if (!redo || (record->xl_info & XLR_BKP_BLOCK_1)) - return; - - xlrec = (xl_btree_delete *) XLogRecGetData(record); - reln = XLogOpenRelation(redo, RM_BTREE_ID, 
xlrec->target.node); - if (!RelationIsValid(reln)) - return; - buffer = XLogReadBuffer(false, reln, - ItemPointerGetBlockNumber(&(xlrec->target.tid))); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_delete_redo: block unfound"); - page = (Page) BufferGetPage(buffer); - if (PageIsNew((PageHeader) page)) - elog(PANIC, "btree_delete_redo: uninitialized page"); - - if (XLByteLE(lsn, PageGetLSN(page))) - { - UnlockAndReleaseBuffer(buffer); - return; - } - - PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid))); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); - - return; -} - -static void -btree_xlog_insert(bool redo, XLogRecPtr lsn, XLogRecord *record) -{ - xl_btree_insert *xlrec; - Relation reln; - Buffer buffer; - Page page; - BTPageOpaque pageop; - - if (redo && (record->xl_info & XLR_BKP_BLOCK_1)) - return; - - xlrec = (xl_btree_insert *) XLogRecGetData(record); - reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); - if (!RelationIsValid(reln)) - return; - buffer = XLogReadBuffer(false, reln, - ItemPointerGetBlockNumber(&(xlrec->target.tid))); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un"); - page = (Page) BufferGetPage(buffer); - if (PageIsNew((PageHeader) page)) - elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? 
"re" : "un"); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - if (redo) - { - if (XLByteLE(lsn, PageGetLSN(page))) - { - UnlockAndReleaseBuffer(buffer); - return; - } - if (PageAddItem(page, (Item) ((char *) xlrec + SizeOfBtreeInsert), - record->xl_len - SizeOfBtreeInsert, - ItemPointerGetOffsetNumber(&(xlrec->target.tid)), - LP_USED) == InvalidOffsetNumber) - elog(PANIC, "btree_insert_redo: failed to add item"); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); - } - else - { - if (XLByteLT(PageGetLSN(page), lsn)) - elog(PANIC, "btree_insert_undo: bad page LSN"); - - if (!P_ISLEAF(pageop)) - { - UnlockAndReleaseBuffer(buffer); - return; - } - - elog(PANIC, "btree_insert_undo: unimplemented"); - } - - return; -} - -static void -btree_xlog_split(bool redo, bool onleft, XLogRecPtr lsn, XLogRecord *record) -{ - xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); - Relation reln; - BlockNumber blkno; - Buffer buffer; - Page page; - BTPageOpaque pageop; - char *op = (redo) ? "redo" : "undo"; - bool isleaf = (record->xl_info & XLOG_BTREE_LEAF); - - reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); - if (!RelationIsValid(reln)) - return; - - /* Left (original) sibling */ - blkno = (onleft) ? 
ItemPointerGetBlockNumber(&(xlrec->target.tid)) : - BlockIdGetBlockNumber(&(xlrec->otherblk)); - buffer = XLogReadBuffer(false, reln, blkno); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_split_%s: lost left sibling", op); - - page = (Page) BufferGetPage(buffer); - if (redo) - _bt_pageinit(page, BufferGetPageSize(buffer)); - else if (PageIsNew((PageHeader) page)) - elog(PANIC, "btree_split_undo: uninitialized left sibling"); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - if (redo) - { - pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk)); - pageop->btpo_prev = BlockIdGetBlockNumber(&(xlrec->leftblk)); - if (onleft) - pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->otherblk)); - else - pageop->btpo_next = ItemPointerGetBlockNumber(&(xlrec->target.tid)); - pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0; - - _bt_restore_page(page, (char *) xlrec + SizeOfBtreeSplit, xlrec->leftlen); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); - } - else -/* undo */ - { - if (XLByteLT(PageGetLSN(page), lsn)) - elog(PANIC, "btree_split_undo: bad left sibling LSN"); - elog(PANIC, "btree_split_undo: unimplemented"); - } - - /* Right (new) sibling */ - blkno = (onleft) ? BlockIdGetBlockNumber(&(xlrec->otherblk)) : - ItemPointerGetBlockNumber(&(xlrec->target.tid)); - buffer = XLogReadBuffer((redo) ? true : false, reln, blkno); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_split_%s: lost right sibling", op); - - page = (Page) BufferGetPage(buffer); - if (redo) - _bt_pageinit(page, BufferGetPageSize(buffer)); - else if (PageIsNew((PageHeader) page)) - elog(PANIC, "btree_split_undo: uninitialized right sibling"); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - if (redo) - { - pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk)); - pageop->btpo_prev = (onleft) ? 
- ItemPointerGetBlockNumber(&(xlrec->target.tid)) : - BlockIdGetBlockNumber(&(xlrec->otherblk)); - pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->rightblk)); - pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0; - - _bt_restore_page(page, - (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen, - record->xl_len - SizeOfBtreeSplit - xlrec->leftlen); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); - } - else -/* undo */ - { - if (XLByteLT(PageGetLSN(page), lsn)) - elog(PANIC, "btree_split_undo: bad right sibling LSN"); - elog(PANIC, "btree_split_undo: unimplemented"); - } - - if (!redo || (record->xl_info & XLR_BKP_BLOCK_1)) - return; - - /* Right (next) page */ - blkno = BlockIdGetBlockNumber(&(xlrec->rightblk)); - if (blkno == P_NONE) - return; - - buffer = XLogReadBuffer(false, reln, blkno); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_split_redo: lost next right page"); - - page = (Page) BufferGetPage(buffer); - if (PageIsNew((PageHeader) page)) - elog(PANIC, "btree_split_redo: uninitialized next right page"); - - if (XLByteLE(lsn, PageGetLSN(page))) - { - UnlockAndReleaseBuffer(buffer); - return; - } - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - pageop->btpo_prev = (onleft) ? 
- BlockIdGetBlockNumber(&(xlrec->otherblk)) : - ItemPointerGetBlockNumber(&(xlrec->target.tid)); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); -} - -static void -btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record) -{ - xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); - Relation reln; - Buffer buffer; - Page page; - BTPageOpaque pageop; - Buffer metabuf; - Page metapg; - BTMetaPageData md; - - if (!redo) - return; - - reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node); - if (!RelationIsValid(reln)) - return; - buffer = XLogReadBuffer(true, reln, BlockIdGetBlockNumber(&(xlrec->rootblk))); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_newroot_redo: no root page"); - metabuf = XLogReadBuffer(false, reln, BTREE_METAPAGE); - if (!BufferIsValid(buffer)) - elog(PANIC, "btree_newroot_redo: no metapage"); - page = (Page) BufferGetPage(buffer); - _bt_pageinit(page, BufferGetPageSize(buffer)); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - pageop->btpo_flags |= BTP_ROOT; - pageop->btpo_prev = pageop->btpo_next = P_NONE; - pageop->btpo_parent = BTREE_METAPAGE; - - if (record->xl_info & XLOG_BTREE_LEAF) - pageop->btpo_flags |= BTP_LEAF; - - if (record->xl_len > SizeOfBtreeNewroot) - _bt_restore_page(page, - (char *) xlrec + SizeOfBtreeNewroot, - record->xl_len - SizeOfBtreeNewroot); - - PageSetLSN(page, lsn); - PageSetSUI(page, ThisStartUpID); - UnlockAndWriteBuffer(buffer); - - metapg = BufferGetPage(metabuf); - _bt_pageinit(metapg, BufferGetPageSize(metabuf)); - md.btm_magic = BTREE_MAGIC; - md.btm_version = BTREE_VERSION; - md.btm_root = BlockIdGetBlockNumber(&(xlrec->rootblk)); - md.btm_level = xlrec->level; - memcpy((char *) BTPageGetMeta(metapg), (char *) &md, sizeof(md)); - - pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); - pageop->btpo_flags = BTP_META; - - PageSetLSN(metapg, lsn); - PageSetSUI(metapg, ThisStartUpID); - UnlockAndWriteBuffer(metabuf); -} - 
-void -btree_redo(XLogRecPtr lsn, XLogRecord *record) -{ - uint8 info = record->xl_info & ~XLR_INFO_MASK; - - info &= ~XLOG_BTREE_LEAF; - if (info == XLOG_BTREE_DELETE) - btree_xlog_delete(true, lsn, record); - else if (info == XLOG_BTREE_INSERT) - btree_xlog_insert(true, lsn, record); - else if (info == XLOG_BTREE_SPLIT) - btree_xlog_split(true, false, lsn, record); /* new item on the right */ - else if (info == XLOG_BTREE_SPLEFT) - btree_xlog_split(true, true, lsn, record); /* new item on the left */ - else if (info == XLOG_BTREE_NEWROOT) - btree_xlog_newroot(true, lsn, record); - else - elog(PANIC, "btree_redo: unknown op code %u", info); -} - -void -btree_undo(XLogRecPtr lsn, XLogRecord *record) -{ - uint8 info = record->xl_info & ~XLR_INFO_MASK; - - info &= ~XLOG_BTREE_LEAF; - if (info == XLOG_BTREE_DELETE) - btree_xlog_delete(false, lsn, record); - else if (info == XLOG_BTREE_INSERT) - btree_xlog_insert(false, lsn, record); - else if (info == XLOG_BTREE_SPLIT) - btree_xlog_split(false, false, lsn, record); /* new item on the right */ - else if (info == XLOG_BTREE_SPLEFT) - btree_xlog_split(false, true, lsn, record); /* new item on the left */ - else if (info == XLOG_BTREE_NEWROOT) - btree_xlog_newroot(false, lsn, record); - else - elog(PANIC, "btree_undo: unknown op code %u", info); -} - -static void -out_target(char *buf, xl_btreetid *target) -{ - sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u", - target->node.tblNode, target->node.relNode, - ItemPointerGetBlockNumber(&(target->tid)), - ItemPointerGetOffsetNumber(&(target->tid))); -} - -void -btree_desc(char *buf, uint8 xl_info, char *rec) -{ - uint8 info = xl_info & ~XLR_INFO_MASK; - - info &= ~XLOG_BTREE_LEAF; - if (info == XLOG_BTREE_INSERT) - { - xl_btree_insert *xlrec = (xl_btree_insert *) rec; - - strcat(buf, "insert: "); - out_target(buf, &(xlrec->target)); - } - else if (info == XLOG_BTREE_DELETE) - { - xl_btree_delete *xlrec = (xl_btree_delete *) rec; - - strcat(buf, "delete: "); - 
out_target(buf, &(xlrec->target)); - } - else if (info == XLOG_BTREE_SPLIT || info == XLOG_BTREE_SPLEFT) - { - xl_btree_split *xlrec = (xl_btree_split *) rec; - - sprintf(buf + strlen(buf), "split(%s): ", - (info == XLOG_BTREE_SPLIT) ? "right" : "left"); - out_target(buf, &(xlrec->target)); - sprintf(buf + strlen(buf), "; oth %u; rgh %u", - BlockIdGetBlockNumber(&xlrec->otherblk), - BlockIdGetBlockNumber(&xlrec->rightblk)); - } - else if (info == XLOG_BTREE_NEWROOT) - { - xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; - - sprintf(buf + strlen(buf), "root: node %u/%u; blk %u", - xlrec->node.tblNode, xlrec->node.relNode, - BlockIdGetBlockNumber(&xlrec->rootblk)); - } - else - strcat(buf, "UNKNOWN"); -} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 66f2428cd4..0daae3cd58 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -895,6 +895,89 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) return true; } +/* + * _bt_get_endpoint() -- Find the first or last page on a given tree level + * + * If the index is empty, we will return InvalidBuffer; any other failure + * condition causes elog(). + * + * The returned buffer is pinned and read-locked. + */ +Buffer +_bt_get_endpoint(Relation rel, uint32 level, bool rightmost) +{ + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + BlockNumber blkno; + BTItem btitem; + IndexTuple itup; + + /* + * If we are looking for a leaf page, okay to descend from fast root; + * otherwise better descend from true root. 
(There is no point in being + * smarter about intermediate levels.) + */ + if (level == 0) + buf = _bt_getroot(rel, BT_READ); + else + buf = _bt_gettrueroot(rel); + + if (!BufferIsValid(buf)) + { + /* empty index... */ + return InvalidBuffer; + } + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + /* + * If we landed on a deleted page, step right to find a live page + * (there must be one). Also, if we want the rightmost page, + * step right if needed to get to it (this could happen if the + * page split since we obtained a pointer to it). + */ + while (P_ISDELETED(opaque) || + (rightmost && !P_RIGHTMOST(opaque))) + { + blkno = opaque->btpo_next; + if (blkno == P_NONE) + elog(ERROR, "_bt_get_endpoint: ran off end of btree"); + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Done? */ + if (opaque->btpo.level == level) + break; + if (opaque->btpo.level < level) + elog(ERROR, "_bt_get_endpoint: btree level %u not found", level); + + /* Step to leftmost or rightmost child page */ + if (rightmost) + offnum = PageGetMaxOffsetNumber(page); + else + offnum = P_FIRSTDATAKEY(opaque); + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + return buf; +} + /* * _bt_endpoint() -- Find the first or last key in the index. * @@ -910,8 +993,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) Page page; BTPageOpaque opaque; ItemPointer current; - OffsetNumber offnum, - maxoff; + OffsetNumber maxoff; OffsetNumber start; BlockNumber blkno; BTItem btitem; @@ -929,7 +1011,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * simplified version of _bt_search(). 
We don't maintain a stack * since we know we won't need it. */ - buf = _bt_getroot(rel, BT_READ); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir)); if (!BufferIsValid(buf)) { @@ -942,51 +1024,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(opaque)); - for (;;) - { - if (P_ISLEAF(opaque)) - break; - - if (ScanDirectionIsForward(dir)) - offnum = P_FIRSTDATAKEY(opaque); - else - offnum = PageGetMaxOffsetNumber(page); - - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &(btitem->bti_itup); - blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - - _bt_relbuf(rel, buf); - buf = _bt_getbuf(rel, blkno, BT_READ); - - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * Race condition: If the child page we just stepped onto was just - * split, we need to make sure we're all the way at the right edge - * of the tree. See the paper by Lehman and Yao. 
- */ - if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque)) - { - do - { - blkno = opaque->btpo_next; - _bt_relbuf(rel, buf); - buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - } while (!P_RIGHTMOST(opaque)); - } - } - - /* okay, we've got the {left,right}-most page in the tree */ maxoff = PageGetMaxOffsetNumber(page); if (ScanDirectionIsForward(dir)) { - Assert(P_LEFTMOST(opaque)); + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ start = P_FIRSTDATAKEY(opaque); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index b0c6db8720..f9d227ecd0 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -35,7 +35,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.70 2002/11/15 01:26:08 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,6 +43,7 @@ #include "postgres.h" #include "access/nbtree.h" +#include "miscadmin.h" #include "utils/tuplesort.h" @@ -76,7 +77,7 @@ typedef struct BTPageState BTItem btps_minkey; /* copy of minimum key (first item) on * page */ OffsetNumber btps_lastoff; /* last item offset loaded */ - int btps_level; /* tree level (0 = leaf) */ + uint32 btps_level; /* tree level (0 = leaf) */ Size btps_full; /* "full" if less than this much free * space */ struct BTPageState *btps_next; /* link to parent level, if any */ @@ -90,8 +91,9 @@ typedef struct BTPageState 0) -static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags); -static BTPageState *_bt_pagestate(Relation index, int flags, int level); +static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, + uint32 
level); +static BTPageState *_bt_pagestate(Relation index, uint32 level); static void _bt_slideleft(Relation index, Buffer buf, Page page); static void _bt_sortaddtup(Page page, Size itemsize, BTItem btitem, OffsetNumber itup_off); @@ -179,7 +181,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) * allocate a new, clean btree page, not linked to any siblings. */ static void -_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +_bt_blnewpage(Relation index, Buffer *buf, Page *page, uint32 level) { BTPageOpaque opaque; @@ -192,23 +194,67 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) /* Initialize BT opaque state */ opaque = (BTPageOpaque) PageGetSpecialPointer(*page); opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo_flags = flags; + opaque->btpo.level = level; + opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; /* Make the P_HIKEY line pointer appear allocated */ ((PageHeader) *page)->pd_lower += sizeof(ItemIdData); } +/* + * emit a completed btree page, and release the lock and pin on it. + * This is essentially _bt_wrtbuf except we also emit a WAL record. + */ +static void +_bt_blwritepage(Relation index, Buffer buf) +{ + Page pg = BufferGetPage(buf); + + /* NO ELOG(ERROR) from here till newpage op is logged */ + START_CRIT_SECTION(); + + /* XLOG stuff */ + if (!index->rd_istemp) + { + xl_btree_newpage xlrec; + XLogRecPtr recptr; + XLogRecData rdata[2]; + + xlrec.node = index->rd_node; + xlrec.blkno = BufferGetBlockNumber(buf); + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfBtreeNewpage; + rdata[0].next = &(rdata[1]); + + rdata[1].buffer = buf; + rdata[1].data = (char *) pg; + rdata[1].len = BLCKSZ; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWPAGE, rdata); + + PageSetLSN(pg, recptr); + PageSetSUI(pg, ThisStartUpID); + } + + END_CRIT_SECTION(); + + _bt_wrtbuf(index, buf); +} + /* * allocate and initialize a new BTPageState. 
the returned structure * is suitable for immediate use by _bt_buildadd. */ static BTPageState * -_bt_pagestate(Relation index, int flags, int level) +_bt_pagestate(Relation index, uint32 level) { BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); /* create initial page */ - _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); + _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), level); state->btps_minkey = (BTItem) NULL; /* initialize lastoff so first item goes into P_FIRSTKEY */ @@ -365,9 +411,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) ItemId hii; BTItem obti; - /* Create new page */ - _bt_blnewpage(index, &nbuf, &npage, - (state->btps_level > 0) ? 0 : BTP_LEAF); + /* Create new page on same level */ + _bt_blnewpage(index, &nbuf, &npage, state->btps_level); /* * We copy the last item on the page into the new page, and then @@ -396,10 +441,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) * btree level. */ if (state->btps_next == (BTPageState *) NULL) - { - state->btps_next = - _bt_pagestate(index, 0, state->btps_level + 1); - } + state->btps_next = _bt_pagestate(index, state->btps_level + 1); + Assert(state->btps_minkey != NULL); ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid), BufferGetBlockNumber(obuf), P_HIKEY); @@ -414,16 +457,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) state->btps_minkey = _bt_formitem(&(obti->bti_itup)); /* - * Set the sibling links for both pages, and parent links too. - * - * It's not necessary to set the parent link at all, because it's - * only used for handling concurrent root splits, but we may as - * well do it as a debugging aid. Note we set new page's link as - * well as old's, because if the new page turns out to be the last - * of the level, _bt_uppershutdown won't change it. The links may - * be out of date by the time the build finishes, but that's OK; - * they need only point to a left-sibling of the true parent. 
See - * the README file for more info. + * Set the sibling links for both pages. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); @@ -431,9 +465,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) oopaque->btpo_next = BufferGetBlockNumber(nbuf); nopaque->btpo_prev = BufferGetBlockNumber(obuf); - nopaque->btpo_next = P_NONE; - oopaque->btpo_parent = nopaque->btpo_parent = - BufferGetBlockNumber(state->btps_next->btps_buf); + nopaque->btpo_next = P_NONE; /* redundant */ } /* @@ -441,7 +473,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) * can give up our lock (if we had one; most likely BuildingBtree * is set, so we aren't locking). */ - _bt_wrtbuf(index, obuf); + _bt_blwritepage(index, obuf); /* * Reset last_off to point to new page @@ -519,7 +551,7 @@ _bt_uppershutdown(Relation index, BTPageState *state) * slid back one slot. Then we can dump out the page. */ _bt_slideleft(index, s->btps_buf, s->btps_page); - _bt_wrtbuf(index, s->btps_buf); + _bt_blwritepage(index, s->btps_buf); } } @@ -603,7 +635,7 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2) /* When we see first tuple, create first index page */ if (state == NULL) - state = _bt_pagestate(index, BTP_LEAF, 0); + state = _bt_pagestate(index, 0); if (load1) { @@ -623,13 +655,13 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2) _bt_freeskey(indexScanKey); } else -/* merge is unnecessary */ { + /* merge is unnecessary */ while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL) { /* When we see first tuple, create first index page */ if (state == NULL) - state = _bt_pagestate(index, BTP_LEAF, 0); + state = _bt_pagestate(index, 0); _bt_buildadd(index, state, bti); if (should_free) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c new file mode 100644 index 0000000000..87a0aaaa7a --- /dev/null +++ b/src/backend/access/nbtree/nbtxlog.c @@ -0,0 
+1,780 @@ +/*------------------------------------------------------------------------- + * + * nbtxlog.c + * WAL replay logic for btrees. + * + * + * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.1 2003/02/21 00:06:21 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/xlogutils.h" + + +/* + * We must keep track of expected insertions due to page splits, and apply + * them manually if they are not seen in the WAL log during replay. This + * makes it safe for page insertion to be a multiple-WAL-action process. + * + * The data structure is a simple linked list --- this should be good enough, + * since we don't expect a page split to remain incomplete for long. + */ +typedef struct bt_incomplete_split +{ + RelFileNode node; /* the index */ + BlockNumber leftblk; /* left half of split */ + BlockNumber rightblk; /* right half of split */ + bool is_root; /* we split the root */ +} bt_incomplete_split; + +static List *incomplete_splits; + + +static void +log_incomplete_split(RelFileNode node, BlockNumber leftblk, + BlockNumber rightblk, bool is_root) +{ + bt_incomplete_split *split = palloc(sizeof(bt_incomplete_split)); + + split->node = node; + split->leftblk = leftblk; + split->rightblk = rightblk; + split->is_root = is_root; + incomplete_splits = lappend(incomplete_splits, split); +} + +static void +forget_matching_split(Relation reln, RelFileNode node, + BlockNumber insertblk, OffsetNumber offnum, + bool is_root) +{ + Buffer buffer; + Page page; + BTItem btitem; + BlockNumber rightblk; + List *l; + + /* Get downlink TID from page */ + buffer = XLogReadBuffer(false, reln, insertblk); + if (!BufferIsValid(buffer)) + elog(PANIC, "forget_matching_split: block unfound"); + 
page = (Page) BufferGetPage(buffer); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + rightblk = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)); + Assert(ItemPointerGetOffsetNumber(&(btitem->bti_itup.t_tid)) == P_HIKEY); + UnlockAndReleaseBuffer(buffer); + + foreach(l, incomplete_splits) + { + bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l); + + if (RelFileNodeEquals(node, split->node) && + rightblk == split->rightblk) + { + if (is_root != split->is_root) + elog(LOG, "forget_matching_split: fishy is_root data"); + incomplete_splits = lremove(split, incomplete_splits); + break; /* need not look further */ + } + } +} + +static void +_bt_restore_page(Page page, char *from, int len) +{ + BTItemData btdata; + Size itemsz; + char *end = from + len; + + for (; from < end;) + { + memcpy(&btdata, from, sizeof(BTItemData)); + itemsz = IndexTupleDSize(btdata.bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + itemsz = MAXALIGN(itemsz); + if (PageAddItem(page, (Item) from, itemsz, + FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) + elog(PANIC, "_bt_restore_page: can't add item to page"); + from += itemsz; + } +} + +static void +_bt_restore_meta(Relation reln, XLogRecPtr lsn, + BlockNumber root, uint32 level, + BlockNumber fastroot, uint32 fastlevel) +{ + Buffer metabuf; + Page metapg; + BTMetaPageData *md; + BTPageOpaque pageop; + + metabuf = XLogReadBuffer(true, reln, BTREE_METAPAGE); + if (!BufferIsValid(metabuf)) + elog(PANIC, "_bt_restore_meta: no metapage"); + + metapg = BufferGetPage(metabuf); + _bt_pageinit(metapg, BufferGetPageSize(metabuf)); + + md = BTPageGetMeta(metapg); + md->btm_magic = BTREE_MAGIC; + md->btm_version = BTREE_VERSION; + md->btm_root = root; + md->btm_level = level; + md->btm_fastroot = fastroot; + md->btm_fastlevel = fastlevel; + + pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); + pageop->btpo_flags = BTP_META; + + PageSetLSN(metapg, lsn); + PageSetSUI(metapg, ThisStartUpID); + 
UnlockAndWriteBuffer(metabuf); +} + +static void +btree_xlog_insert(bool redo, bool isleaf, bool ismeta, + XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); + Relation reln; + Buffer buffer; + Page page; + BTPageOpaque pageop; + char *datapos; + int datalen; + xl_btree_metadata md; + + datapos = (char *) xlrec + SizeOfBtreeInsert; + datalen = record->xl_len - SizeOfBtreeInsert; + if (ismeta) + { + memcpy(&md, datapos, sizeof(xl_btree_metadata)); + datapos += sizeof(xl_btree_metadata); + datalen -= sizeof(xl_btree_metadata); + } + + if (redo && (record->xl_info & XLR_BKP_BLOCK_1) && !ismeta && + incomplete_splits == NIL) + return; /* nothing to do */ + + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); + if (!RelationIsValid(reln)) + return; + + if (!redo || !(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(false, reln, + ItemPointerGetBlockNumber(&(xlrec->target.tid))); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un"); + page = (Page) BufferGetPage(buffer); + if (PageIsNew((PageHeader) page)) + elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? 
"re" : "un"); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + if (redo) + { + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockAndReleaseBuffer(buffer); + } + else + { + if (PageAddItem(page, (Item) datapos, datalen, + ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + LP_USED) == InvalidOffsetNumber) + elog(PANIC, "btree_insert_redo: failed to add item"); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); + } + } + else + { + if (XLByteLT(PageGetLSN(page), lsn)) + elog(PANIC, "btree_insert_undo: bad page LSN"); + + if (!P_ISLEAF(pageop)) + { + UnlockAndReleaseBuffer(buffer); + } + else + { + elog(PANIC, "btree_insert_undo: unimplemented"); + } + } + } + + if (redo) /* metapage changes not undoable */ + { + if (ismeta) + _bt_restore_meta(reln, lsn, + md.root, md.level, + md.fastroot, md.fastlevel); + } + + /* Forget any split this insertion completes */ + if (redo && !isleaf && incomplete_splits != NIL) + { + forget_matching_split(reln, xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + false); + } +} + +static void +btree_xlog_split(bool redo, bool onleft, bool isroot, + XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); + Relation reln; + BlockNumber targetblk; + BlockNumber leftsib; + BlockNumber rightsib; + Buffer buffer; + Page page; + BTPageOpaque pageop; + char *op = (redo) ? "redo" : "undo"; + + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); + if (!RelationIsValid(reln)) + return; + + targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid)); + leftsib = (onleft) ? targetblk : xlrec->otherblk; + rightsib = (onleft) ? 
xlrec->otherblk : targetblk; + + /* Left (original) sibling */ + buffer = XLogReadBuffer(false, reln, leftsib); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_split_%s: lost left sibling", op); + + page = (Page) BufferGetPage(buffer); + if (redo) + _bt_pageinit(page, BufferGetPageSize(buffer)); + else if (PageIsNew((PageHeader) page)) + elog(PANIC, "btree_split_undo: uninitialized left sibling"); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + if (redo) + { + pageop->btpo_prev = xlrec->leftblk; + pageop->btpo_next = rightsib; + pageop->btpo.level = xlrec->level; + pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; + + _bt_restore_page(page, + (char *) xlrec + SizeOfBtreeSplit, + xlrec->leftlen); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); + } + else + { + /* undo */ + if (XLByteLT(PageGetLSN(page), lsn)) + elog(PANIC, "btree_split_undo: bad left sibling LSN"); + elog(PANIC, "btree_split_undo: unimplemented"); + } + + /* Right (new) sibling */ + buffer = XLogReadBuffer((redo) ? true : false, reln, rightsib); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_split_%s: lost right sibling", op); + + page = (Page) BufferGetPage(buffer); + if (redo) + _bt_pageinit(page, BufferGetPageSize(buffer)); + else if (PageIsNew((PageHeader) page)) + elog(PANIC, "btree_split_undo: uninitialized right sibling"); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + if (redo) + { + pageop->btpo_prev = leftsib; + pageop->btpo_next = xlrec->rightblk; + pageop->btpo.level = xlrec->level; + pageop->btpo_flags = (xlrec->level == 0) ? 
BTP_LEAF : 0; + + _bt_restore_page(page, + (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen, + record->xl_len - SizeOfBtreeSplit - xlrec->leftlen); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); + } + else + { + /* undo */ + if (XLByteLT(PageGetLSN(page), lsn)) + elog(PANIC, "btree_split_undo: bad right sibling LSN"); + elog(PANIC, "btree_split_undo: unimplemented"); + } + + /* Fix left-link of right (next) page */ + if (redo && !(record->xl_info & XLR_BKP_BLOCK_1)) + { + if (xlrec->rightblk != P_NONE) + { + buffer = XLogReadBuffer(false, reln, xlrec->rightblk); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_split_redo: lost next right page"); + + page = (Page) BufferGetPage(buffer); + if (PageIsNew((PageHeader) page)) + elog(PANIC, "btree_split_redo: uninitialized next right page"); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockAndReleaseBuffer(buffer); + } + else + { + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = rightsib; + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); + } + } + } + + /* Forget any split this insertion completes */ + if (redo && xlrec->level > 0 && incomplete_splits != NIL) + { + forget_matching_split(reln, xlrec->target.node, + ItemPointerGetBlockNumber(&(xlrec->target.tid)), + ItemPointerGetOffsetNumber(&(xlrec->target.tid)), + false); + } + + /* The job ain't done till the parent link is inserted... 
*/ + log_incomplete_split(xlrec->target.node, + leftsib, rightsib, isroot); +} + +static void +btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_delete *xlrec; + Relation reln; + Buffer buffer; + Page page; + + if (!redo || (record->xl_info & XLR_BKP_BLOCK_1)) + return; + + xlrec = (xl_btree_delete *) XLogRecGetData(record); + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); + if (!RelationIsValid(reln)) + return; + buffer = XLogReadBuffer(false, reln, + ItemPointerGetBlockNumber(&(xlrec->target.tid))); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_delete_redo: block unfound"); + page = (Page) BufferGetPage(buffer); + if (PageIsNew((PageHeader) page)) + elog(PANIC, "btree_delete_redo: uninitialized page"); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockAndReleaseBuffer(buffer); + return; + } + + PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid))); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); +} + +static void +btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); + Relation reln; + Buffer buffer; + Page page; + BTPageOpaque pageop; + + if (!redo) + return; /* not undoable */ + + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node); + if (!RelationIsValid(reln)) + return; + buffer = XLogReadBuffer(true, reln, xlrec->rootblk); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_newroot_redo: no root page"); + + page = (Page) BufferGetPage(buffer); + _bt_pageinit(page, BufferGetPageSize(buffer)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + pageop->btpo_flags = BTP_ROOT; + pageop->btpo_prev = pageop->btpo_next = P_NONE; + pageop->btpo.level = xlrec->level; + if (xlrec->level == 0) + pageop->btpo_flags |= BTP_LEAF; + + if (record->xl_len > SizeOfBtreeNewroot) + _bt_restore_page(page, + (char *) xlrec + SizeOfBtreeNewroot, + record->xl_len - 
SizeOfBtreeNewroot); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); + + _bt_restore_meta(reln, lsn, + xlrec->rootblk, xlrec->level, + xlrec->rootblk, xlrec->level); + + /* Check to see if this satisfies any incomplete insertions */ + if (record->xl_len > SizeOfBtreeNewroot && + incomplete_splits != NIL) + { + forget_matching_split(reln, xlrec->node, + xlrec->rootblk, + P_FIRSTKEY, + true); + } +} + +static void +btree_xlog_newmeta(bool redo, XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_newmeta *xlrec = (xl_btree_newmeta *) XLogRecGetData(record); + Relation reln; + + if (!redo) + return; /* not undoable */ + + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node); + if (!RelationIsValid(reln)) + return; + + _bt_restore_meta(reln, lsn, + xlrec->meta.root, xlrec->meta.level, + xlrec->meta.fastroot, xlrec->meta.fastlevel); +} + +static void +btree_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record) +{ + xl_btree_newpage *xlrec = (xl_btree_newpage *) XLogRecGetData(record); + Relation reln; + Buffer buffer; + Page page; + + if (!redo || (record->xl_info & XLR_BKP_BLOCK_1)) + return; + + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node); + if (!RelationIsValid(reln)) + return; + buffer = XLogReadBuffer(true, reln, xlrec->blkno); + if (!BufferIsValid(buffer)) + elog(PANIC, "btree_newpage_redo: block unfound"); + page = (Page) BufferGetPage(buffer); + + Assert(record->xl_len == SizeOfBtreeNewpage + BLCKSZ); + memcpy(page, (char *) xlrec + SizeOfBtreeNewpage, BLCKSZ); + + PageSetLSN(page, lsn); + PageSetSUI(page, ThisStartUpID); + UnlockAndWriteBuffer(buffer); +} + + +void +btree_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + btree_xlog_insert(true, true, false, lsn, record); + break; + case XLOG_BTREE_INSERT_UPPER: + btree_xlog_insert(true, false, false, lsn, record); + break; + case 
XLOG_BTREE_INSERT_META: + btree_xlog_insert(true, false, true, lsn, record); + break; + case XLOG_BTREE_SPLIT_L: + btree_xlog_split(true, true, false, lsn, record); + break; + case XLOG_BTREE_SPLIT_R: + btree_xlog_split(true, false, false, lsn, record); + break; + case XLOG_BTREE_SPLIT_L_ROOT: + btree_xlog_split(true, true, true, lsn, record); + break; + case XLOG_BTREE_SPLIT_R_ROOT: + btree_xlog_split(true, false, true, lsn, record); + break; + case XLOG_BTREE_DELETE: + btree_xlog_delete(true, lsn, record); + break; + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + // ??? + break; + case XLOG_BTREE_NEWROOT: + btree_xlog_newroot(true, lsn, record); + break; + case XLOG_BTREE_NEWMETA: + btree_xlog_newmeta(true, lsn, record); + break; + case XLOG_BTREE_NEWPAGE: + btree_xlog_newpage(true, lsn, record); + break; + default: + elog(PANIC, "btree_redo: unknown op code %u", info); + } +} + +void +btree_undo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + btree_xlog_insert(false, true, false, lsn, record); + break; + case XLOG_BTREE_INSERT_UPPER: + btree_xlog_insert(false, false, false, lsn, record); + break; + case XLOG_BTREE_INSERT_META: + btree_xlog_insert(false, false, true, lsn, record); + break; + case XLOG_BTREE_SPLIT_L: + btree_xlog_split(false, true, false, lsn, record); + break; + case XLOG_BTREE_SPLIT_R: + btree_xlog_split(false, false, false, lsn, record); + break; + case XLOG_BTREE_SPLIT_L_ROOT: + btree_xlog_split(false, true, true, lsn, record); + break; + case XLOG_BTREE_SPLIT_R_ROOT: + btree_xlog_split(false, false, true, lsn, record); + break; + case XLOG_BTREE_DELETE: + btree_xlog_delete(false, lsn, record); + break; + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + // ??? 
+ break; + case XLOG_BTREE_NEWROOT: + btree_xlog_newroot(false, lsn, record); + break; + case XLOG_BTREE_NEWMETA: + btree_xlog_newmeta(false, lsn, record); + break; + case XLOG_BTREE_NEWPAGE: + btree_xlog_newpage(false, lsn, record); + break; + default: + elog(PANIC, "btree_undo: unknown op code %u", info); + } +} + +static void +out_target(char *buf, xl_btreetid *target) +{ + sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u", + target->node.tblNode, target->node.relNode, + ItemPointerGetBlockNumber(&(target->tid)), + ItemPointerGetOffsetNumber(&(target->tid))); +} + +void +btree_desc(char *buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_BTREE_INSERT_LEAF: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + strcat(buf, "insert: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_INSERT_UPPER: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + strcat(buf, "insert_upper: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_INSERT_META: + { + xl_btree_insert *xlrec = (xl_btree_insert *) rec; + + strcat(buf, "insert_meta: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_SPLIT_L: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + strcat(buf, "split_l: "); + out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "; oth %u; rgh %u", + xlrec->otherblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_SPLIT_R: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + strcat(buf, "split_r: "); + out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "; oth %u; rgh %u", + xlrec->otherblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_SPLIT_L_ROOT: + { + xl_btree_split *xlrec = (xl_btree_split *) rec; + + strcat(buf, "split_l_root: "); + out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "; oth %u; rgh %u", + xlrec->otherblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_SPLIT_R_ROOT: + { + 
xl_btree_split *xlrec = (xl_btree_split *) rec; + + strcat(buf, "split_r_root: "); + out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "; oth %u; rgh %u", + xlrec->otherblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_DELETE: + { + xl_btree_delete *xlrec = (xl_btree_delete *) rec; + + strcat(buf, "delete: "); + out_target(buf, &(xlrec->target)); + break; + } + case XLOG_BTREE_DELETE_PAGE: + case XLOG_BTREE_DELETE_PAGE_META: + { + xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec; + + strcat(buf, "delete_page: "); + out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "; dead %u; left %u; right %u", + xlrec->deadblk, xlrec->leftblk, xlrec->rightblk); + break; + } + case XLOG_BTREE_NEWROOT: + { + xl_btree_newroot *xlrec = (xl_btree_newroot *) rec; + + sprintf(buf + strlen(buf), "newroot: node %u/%u; root %u lev %u", + xlrec->node.tblNode, xlrec->node.relNode, + xlrec->rootblk, xlrec->level); + break; + } + case XLOG_BTREE_NEWMETA: + { + xl_btree_newmeta *xlrec = (xl_btree_newmeta *) rec; + + sprintf(buf + strlen(buf), "newmeta: node %u/%u; root %u lev %u fast %u lev %u", + xlrec->node.tblNode, xlrec->node.relNode, + xlrec->meta.root, xlrec->meta.level, + xlrec->meta.fastroot, xlrec->meta.fastlevel); + break; + } + case XLOG_BTREE_NEWPAGE: + { + xl_btree_newpage *xlrec = (xl_btree_newpage *) rec; + + sprintf(buf + strlen(buf), "newpage: node %u/%u; page %u", + xlrec->node.tblNode, xlrec->node.relNode, + xlrec->blkno); + break; + } + default: + strcat(buf, "UNKNOWN"); + break; + } +} + +void +btree_xlog_startup(void) +{ + incomplete_splits = NIL; +} + +void +btree_xlog_cleanup(void) +{ + List *l; + + foreach(l, incomplete_splits) + { + bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l); + Relation reln; + Buffer lbuf, + rbuf; + Page lpage, + rpage; + BTPageOpaque lpageop, + rpageop; + bool is_only; + + reln = XLogOpenRelation(true, RM_BTREE_ID, split->node); + if (!RelationIsValid(reln)) + continue; + lbuf = 
XLogReadBuffer(false, reln, split->leftblk); + if (!BufferIsValid(lbuf)) + elog(PANIC, "btree_xlog_cleanup: left block unfound"); + lpage = (Page) BufferGetPage(lbuf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + rbuf = XLogReadBuffer(false, reln, split->rightblk); + if (!BufferIsValid(rbuf)) + elog(PANIC, "btree_xlog_cleanup: right block unfound"); + rpage = (Page) BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* if the two pages are all of their level, it's an only-page split */ + is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop); + + _bt_insert_parent(reln, lbuf, rbuf, (BTStack) NULL, + split->is_root, is_only); + } + incomplete_splits = NIL; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index b9af3a06c9..59af280802 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.9 2001/08/25 18:52:41 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.10 2003/02/21 00:06:22 tgl Exp $ */ #include "postgres.h" @@ -19,21 +19,22 @@ #include "commands/sequence.h" -RmgrData RmgrTable[] = { - {"XLOG", xlog_redo, xlog_undo, xlog_desc}, - {"Transaction", xact_redo, xact_undo, xact_desc}, - {"Storage", smgr_redo, smgr_undo, smgr_desc}, - {"CLOG", clog_redo, clog_undo, clog_desc}, - {"Reserved 4", NULL, NULL, NULL}, - {"Reserved 5", NULL, NULL, NULL}, - {"Reserved 6", NULL, NULL, NULL}, - {"Reserved 7", NULL, NULL, NULL}, - {"Reserved 8", NULL, NULL, NULL}, - {"Reserved 9", NULL, NULL, NULL}, - {"Heap", heap_redo, heap_undo, heap_desc}, - {"Btree", btree_redo, btree_undo, btree_desc}, - {"Hash", hash_redo, hash_undo, hash_desc}, - {"Rtree", rtree_redo, rtree_undo, rtree_desc}, - {"Gist", gist_redo, gist_undo, gist_desc}, - {"Sequence", seq_redo, seq_undo, seq_desc} +RmgrData RmgrTable[RM_MAX_ID+1] = { + {"XLOG", 
xlog_redo, xlog_undo, xlog_desc, NULL, NULL}, + {"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL}, + {"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL}, + {"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL}, + {"Reserved 4", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 5", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 6", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 8", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 9", NULL, NULL, NULL, NULL, NULL}, + {"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL}, + {"Btree", btree_redo, btree_undo, btree_desc, + btree_xlog_startup, btree_xlog_cleanup}, + {"Hash", hash_redo, hash_undo, hash_desc, NULL, NULL}, + {"Rtree", rtree_redo, rtree_undo, rtree_desc, NULL, NULL}, + {"Gist", gist_redo, gist_undo, gist_desc, NULL, NULL}, + {"Sequence", seq_redo, seq_undo, seq_desc, NULL, NULL} }; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c35762bba9..3b615f8229 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.111 2003/01/25 03:06:04 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.112 2003/02/21 00:06:22 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1203,16 +1203,6 @@ XLogFlush(XLogRecPtr record) XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; - if (XLOG_DEBUG) - { - elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X", - (IsBootstrapProcessingMode()) ? "(bootstrap)" : "", - (InRedo) ? 
"(redo)" : "", - record.xlogid, record.xrecoff, - LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, - LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); - } - /* Disabled during REDO */ if (InRedo) return; @@ -1221,6 +1211,15 @@ XLogFlush(XLogRecPtr record) if (XLByteLE(record, LogwrtResult.Flush)) return; + if (XLOG_DEBUG) + { + elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X", + (IsBootstrapProcessingMode()) ? "(bootstrap)" : "", + record.xlogid, record.xrecoff, + LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, + LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff); + } + START_CRIT_SECTION(); /* @@ -2515,6 +2514,12 @@ StartupXLOG(void) elog(LOG, "database system was interrupted at %s", str_time(ControlFile->time)); + /* This is just to allow attaching to startup process with a debugger */ +#ifdef XLOG_REPLAY_DELAY + if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED) + sleep(60); +#endif + /* * Get the last valid checkpoint record. If the latest one according * to pg_control is broken, try the next-to-last one. @@ -2578,14 +2583,23 @@ StartupXLOG(void) /* REDO */ if (InRecovery) { + int rmid; + elog(LOG, "database system was not properly shut down; " "automatic recovery in progress"); ControlFile->state = DB_IN_RECOVERY; ControlFile->time = time(NULL); UpdateControlFile(); + /* Start up the recovery environment */ XLogInitRelationCache(); + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + /* Is REDO required ? */ if (XLByteLT(checkPoint.redo, RecPtr)) record = ReadRecord(&(checkPoint.redo), PANIC, buffer); @@ -2737,7 +2751,25 @@ StartupXLOG(void) if (InRecovery) { + int rmid; + /* + * Allow resource managers to do any required cleanup. 
+ */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_cleanup != NULL) + RmgrTable[rmid].rm_cleanup(); + } + + /* suppress in-transaction check in CreateCheckPoint */ + MyLastRecPtr.xrecoff = 0; + MyXactMadeXLogEntry = false; + MyXactMadeTempRelUpdate = false; + + /* + * Perform a new checkpoint to update our recovery activity to disk. + * * In case we had to use the secondary checkpoint, make sure that * it will still be shown as the secondary checkpoint after this * CreateCheckPoint operation; we don't want the broken primary @@ -2745,6 +2777,10 @@ StartupXLOG(void) */ ControlFile->checkPoint = checkPointLoc; CreateCheckPoint(true, true); + + /* + * Close down recovery environment + */ XLogCloseRelationCache(); } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 7d279ef94d..f4dce1842f 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $ + * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,46 +22,55 @@ /* * BTPageOpaqueData -- At the end of every page, we store a pointer * to both siblings in the tree. This is used to do forward/backward - * index scans. See Lehman and Yao's paper for more - * info. In addition, we need to know what type of page this is - * (leaf or internal), and whether the page is available for reuse. + * index scans. The next-page link is also critical for recovery when + * a search has navigated to the wrong page due to concurrent page splits + * or deletions; see src/backend/access/nbtree/README for more info. * - * We also store a back-link to the parent page, but this cannot be trusted - * very far since it does not get updated when the parent is split. 
- * See backend/access/nbtree/README for details. + * In addition, we store the page's btree level (counting upwards from + * zero at a leaf page) as well as some flag bits indicating the page type + * and status. If the page is deleted, we replace the level with the + * next-transaction-ID value indicating when it is safe to reclaim the page. + * + * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested + * instead. */ typedef struct BTPageOpaqueData { - BlockNumber btpo_prev; /* used for backward index scans */ - BlockNumber btpo_next; /* used for forward index scans */ - BlockNumber btpo_parent; /* pointer to parent, but not updated on - * parent split */ - uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, META?, REORDER? */ - + BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ + BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ + union + { + uint32 level; /* tree level --- zero for leaf pages */ + TransactionId xact; /* next transaction ID, if deleted */ + } btpo; + uint16 btpo_flags; /* flag bits, see below */ } BTPageOpaqueData; typedef BTPageOpaqueData *BTPageOpaque; /* Bits defined in btpo_flags */ -#define BTP_LEAF (1 << 0) /* leaf page, if not internal page */ +#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */ #define BTP_ROOT (1 << 1) /* root page (has no parent) */ -#define BTP_FREE (1 << 2) /* page not in use */ +#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ #define BTP_META (1 << 3) /* meta-page */ -#define BTP_REORDER (1 << 4) /* items need reordering */ /* * The Meta page is always the first page in the btree index. * Its primary purpose is to point to the location of the btree root page. + * We also point to the "fast" root, which is the current effective root; + * see README for discussion. 
*/ typedef struct BTMetaPageData { - uint32 btm_magic; - uint32 btm_version; - BlockNumber btm_root; - int32 btm_level; + uint32 btm_magic; /* should contain BTREE_MAGIC */ + uint32 btm_version; /* should contain BTREE_VERSION */ + BlockNumber btm_root; /* current root location */ + uint32 btm_level; /* tree level of the root page */ + BlockNumber btm_fastroot; /* current "fast" root location */ + uint32 btm_fastlevel; /* tree level of the "fast" root page */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -69,12 +78,7 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ - -#define BTreeInvalidParent(opaque) \ - (opaque->btpo_parent == InvalidBlockNumber || \ - opaque->btpo_parent == BTREE_METAPAGE) - -#define BTREE_VERSION 1 +#define BTREE_VERSION 2 /* current version number */ /* * We actually need to be able to fit three items on every page, @@ -84,6 +88,295 @@ typedef struct BTMetaPageData ((PageGetPageSize(page) - \ sizeof(PageHeaderData) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData)) + +/* + * BTItems are what we store in the btree. Each item is an index tuple, + * including key and pointer values. (In some cases either the key or the + * pointer may go unused, see backend/access/nbtree/README for details.) + * + * Old comments: + * In addition, we must guarantee that all tuples in the index are unique, + * in order to satisfy some assumptions in Lehman and Yao. The way that we + * do this is by generating a new OID for every insertion that we do in the + * tree. This adds eight bytes to the size of btree index tuples. Note + * that we do not use the OID as part of a composite key; the OID only + * serves as a unique identifier for a given index tuple (logical position + * within a page). + * + * New comments: + * actually, we must guarantee that all tuples in A LEVEL + * are unique, not in ALL INDEX. 
So, we can use bti_itup->t_tid + * as unique identifier for a given index tuple (logical position + * within a level). - vadim 04/09/97 + */ + +typedef struct BTItemData +{ + IndexTupleData bti_itup; +} BTItemData; + +typedef BTItemData *BTItem; + +/* + * For XLOG: size without alignment. Sizeof works as long as + * IndexTupleData has exactly 8 bytes. + */ +#define SizeOfBTItem sizeof(BTItemData) + +/* Test whether items are the "same" per the above notes */ +#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \ + (i2)->bti_itup.t_tid.ip_blkid.bi_hi && \ + (i1)->bti_itup.t_tid.ip_blkid.bi_lo == \ + (i2)->bti_itup.t_tid.ip_blkid.bi_lo && \ + (i1)->bti_itup.t_tid.ip_posid == \ + (i2)->bti_itup.t_tid.ip_posid ) + +/* + * In general, the btree code tries to localize its knowledge about + * page layout to a couple of routines. However, we need a special + * value to indicate "no page number" in those places where we expect + * page numbers. We can use zero for this because we never need to + * make a pointer to the metadata page. + */ + +#define P_NONE 0 + +/* + * Macros to test whether a page is leftmost or rightmost on its tree level, + * as well as other state info kept in the opaque data. + */ +#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) +#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) +#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF) +#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT) +#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED) + +/* + * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost + * page. The high key is not a data key, but gives info about what range of + * keys is supposed to be on this page. The high key on a page is required + * to be greater than or equal to any data key that appears on the page. 
+ * If we find ourselves trying to insert a key > high key, we know we need + * to move right (this should only happen if the page was split since we + * examined the parent page). + * + * Our insertion algorithm guarantees that we can use the initial least key + * on our right sibling as the high key. Once a page is created, its high + * key changes only if the page is split. + * + * On a non-rightmost page, the high key lives in item 1 and data items + * start in item 2. Rightmost pages have no high key, so we store data + * items beginning in item 1. + */ + +#define P_HIKEY ((OffsetNumber) 1) +#define P_FIRSTKEY ((OffsetNumber) 2) +#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY) + +/* + * XLOG records for btree operations + * + * XLOG allows us to store some information in the high 4 bits of the log + * record's xl_info field + */ +#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */ +#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ +#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ +#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */ +#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ +#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */ +#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */ +#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */ +#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */ +#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */ +#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ +#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */ +#define XLOG_BTREE_NEWPAGE 0xC0 /* new index page during build */ + +/* + * All that we need to find the changed index tuple + */ +typedef struct xl_btreetid +{ + RelFileNode node; + ItemPointerData tid; /* changed tuple id */ +} xl_btreetid; + +/* + * All that we need to regenerate the meta-data page + */ +typedef struct xl_btree_metadata +{ + 
BlockNumber root; + uint32 level; + BlockNumber fastroot; + uint32 fastlevel; +} xl_btree_metadata; + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. + * Note that INSERT_META implies it's not a leaf page. + */ +typedef struct xl_btree_insert +{ + xl_btreetid target; /* inserted tuple id */ + /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */ + /* BTITEM FOLLOWS AT END OF STRUCT */ +} xl_btree_insert; + +#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData) + +/* + * On insert with split we save items of both left and right siblings + * and restore content of both pages from log record. This way takes less + * xlog space than the normal approach, because if we did it standardly, + * XLogInsert would almost always think the right page is new and store its + * whole page image. + * + * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record. + * The _L and _R variants indicate whether the inserted btitem went into the + * left or right split page (and thus, whether otherblk is the right or left + * page of the split pair). The _ROOT variants indicate that we are splitting + * the root page, and thus that a newroot record rather than an insert or + * split record should follow. Note that a split record never carries a + * metapage update --- we'll do that in the parent-level update. 
+ */ +typedef struct xl_btree_split +{ + xl_btreetid target; /* inserted tuple id */ + BlockNumber otherblk; /* second block participating in split: */ + /* first one is stored in target's tid */ + BlockNumber leftblk; /* prev/left block */ + BlockNumber rightblk; /* next/right block */ + uint32 level; /* tree level of page being split */ + uint16 leftlen; /* len of left page items below */ + /* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */ +} xl_btree_split; + +#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) + +/* + * This is what we need to know about delete of an individual leaf btitem + */ +typedef struct xl_btree_delete +{ + xl_btreetid target; /* deleted tuple id */ +} xl_btree_delete; + +#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData) + +/* + * This is what we need to know about deletion of a btree page. The target + * identifies the tuple removed from the parent page (note that we remove + * this tuple's downlink and the *following* tuple's key). Note we do not + * store any content for the deleted page --- it is just rewritten as empty + * during recovery. + */ +typedef struct xl_btree_delete_page +{ + xl_btreetid target; /* deleted tuple id in parent page */ + BlockNumber deadblk; /* child block being deleted */ + BlockNumber leftblk; /* child block's left sibling, if any */ + BlockNumber rightblk; /* child block's right sibling */ + /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */ +} xl_btree_delete_page; + +#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber)) + +/* + * New root log record. There are zero btitems if this is to establish an + * empty root, or two if it is the result of splitting an old root. + * + * Note that although this implies rewriting the metadata page, we don't need + * an xl_btree_metadata record --- the rootblk and level are sufficient. 
+ */ +typedef struct xl_btree_newroot +{ + RelFileNode node; + BlockNumber rootblk; /* location of new root */ + uint32 level; /* its tree level */ + /* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */ +} xl_btree_newroot; + +#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) + +/* + * New metapage log record. This is not issued during routine operations; + * it's only used when initializing an empty index and at completion of + * index build. + */ +typedef struct xl_btree_newmeta +{ + RelFileNode node; + xl_btree_metadata meta; +} xl_btree_newmeta; + +#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta)) + +/* + * New index page log record. This is only used while building a new index. + */ +typedef struct xl_btree_newpage +{ + RelFileNode node; + BlockNumber blkno; /* location of new page */ + /* entire page contents follow at end of record */ +} xl_btree_newpage; + +#define SizeOfBtreeNewpage (offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber)) + + +/* + * Operator strategy numbers -- ordering of these is <, <=, =, >=, > + */ + +#define BTLessStrategyNumber 1 +#define BTLessEqualStrategyNumber 2 +#define BTEqualStrategyNumber 3 +#define BTGreaterEqualStrategyNumber 4 +#define BTGreaterStrategyNumber 5 +#define BTMaxStrategyNumber 5 + +/* + * When a new operator class is declared, we require that the user + * supply us with an amproc procedure for determining whether, for + * two keys a and b, a < b, a = b, or a > b. This routine must + * return < 0, 0, > 0, respectively, in these three cases. Since we + * only have one such proc in amproc, it's number 1. + */ + +#define BTORDER_PROC 1 + +/* + * We need to be able to tell the difference between read and write + * requests for pages, in order to do locking correctly. + */ + +#define BT_READ BUFFER_LOCK_SHARE +#define BT_WRITE BUFFER_LOCK_EXCLUSIVE + +/* + * BTStackData -- As we descend a tree, we push the (location, downlink) + * pairs from internal pages onto a private stack. 
If we split a + * leaf, we use this stack to walk back up the tree and insert data + * into parent pages (and possibly to split them, too). Lehman and + * Yao's update algorithm guarantees that under no circumstances can + * our private stack give us an irredeemably bad picture up the tree. + * Again, see the paper for details. + */ + +typedef struct BTStackData +{ + BlockNumber bts_blkno; + OffsetNumber bts_offset; + BTItemData bts_btitem; + struct BTStackData *bts_parent; +} BTStackData; + +typedef BTStackData *BTStack; + /* * BTScanOpaqueData is used to remember which buffers we're currently * examining in the scan. We keep these buffers pinned (but not locked, @@ -116,212 +409,6 @@ typedef struct BTScanOpaqueData typedef BTScanOpaqueData *BTScanOpaque; -/* - * BTItems are what we store in the btree. Each item is an index tuple, - * including key and pointer values. (In some cases either the key or the - * pointer may go unused, see backend/access/nbtree/README for details.) - * - * Old comments: - * In addition, we must guarantee that all tuples in the index are unique, - * in order to satisfy some assumptions in Lehman and Yao. The way that we - * do this is by generating a new OID for every insertion that we do in the - * tree. This adds eight bytes to the size of btree index tuples. Note - * that we do not use the OID as part of a composite key; the OID only - * serves as a unique identifier for a given index tuple (logical position - * within a page). - * - * New comments: - * actually, we must guarantee that all tuples in A LEVEL - * are unique, not in ALL INDEX. So, we can use bti_itup->t_tid - * as unique identifier for a given index tuple (logical position - * within a level). - vadim 04/09/97 - */ - -typedef struct BTItemData -{ - IndexTupleData bti_itup; -} BTItemData; - -typedef BTItemData *BTItem; - -/* - * For XLOG: size without alignement. Sizeof works as long as - * IndexTupleData has exactly 8 bytes. 
- */ -#define SizeOfBTItem sizeof(BTItemData) - -/* Test whether items are the "same" per the above notes */ -#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \ - (i2)->bti_itup.t_tid.ip_blkid.bi_hi && \ - (i1)->bti_itup.t_tid.ip_blkid.bi_lo == \ - (i2)->bti_itup.t_tid.ip_blkid.bi_lo && \ - (i1)->bti_itup.t_tid.ip_posid == \ - (i2)->bti_itup.t_tid.ip_posid ) - -/* - * BTStackData -- As we descend a tree, we push the (key, pointer) - * pairs from internal nodes onto a private stack. If we split a - * leaf, we use this stack to walk back up the tree and insert data - * into parent nodes (and possibly to split them, too). Lehman and - * Yao's update algorithm guarantees that under no circumstances can - * our private stack give us an irredeemably bad picture up the tree. - * Again, see the paper for details. - */ - -typedef struct BTStackData -{ - BlockNumber bts_blkno; - OffsetNumber bts_offset; - BTItemData bts_btitem; - struct BTStackData *bts_parent; -} BTStackData; - -typedef BTStackData *BTStack; - -/* - * We need to be able to tell the difference between read and write - * requests for pages, in order to do locking correctly. - */ - -#define BT_READ BUFFER_LOCK_SHARE -#define BT_WRITE BUFFER_LOCK_EXCLUSIVE - -/* - * In general, the btree code tries to localize its knowledge about - * page layout to a couple of routines. However, we need a special - * value to indicate "no page number" in those places where we expect - * page numbers. We can use zero for this because we never need to - * make a pointer to the metadata page. - */ - -#define P_NONE 0 - -/* - * Macros to test whether a page is leftmost or rightmost on its tree level, - * as well as other state info kept in the opaque data. 
- */ -#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) -#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) -#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF) -#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT) - -/* - * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost - * page. The high key is not a data key, but gives info about what range of - * keys is supposed to be on this page. The high key on a page is required - * to be greater than or equal to any data key that appears on the page. - * If we find ourselves trying to insert a key > high key, we know we need - * to move right (this should only happen if the page was split since we - * examined the parent page). - * - * Our insertion algorithm guarantees that we can use the initial least key - * on our right sibling as the high key. Once a page is created, its high - * key changes only if the page is split. - * - * On a non-rightmost page, the high key lives in item 1 and data items - * start in item 2. Rightmost pages have no high key, so we store data - * items beginning in item 1. - */ - -#define P_HIKEY ((OffsetNumber) 1) -#define P_FIRSTKEY ((OffsetNumber) 2) -#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY) - -/* - * XLOG allows to store some information in high 4 bits of log - * record xl_info field - */ -#define XLOG_BTREE_DELETE 0x00 /* delete btitem */ -#define XLOG_BTREE_INSERT 0x10 /* add btitem without split */ -#define XLOG_BTREE_SPLIT 0x20 /* add btitem with split */ -#define XLOG_BTREE_SPLEFT 0x30 /* as above + flag that new btitem */ - /* goes to the left sibling */ -#define XLOG_BTREE_NEWROOT 0x40 /* new root page */ - -#define XLOG_BTREE_LEAF 0x80 /* leaf/internal page was changed */ - -/* - * All what we need to find changed index tuple - */ -typedef struct xl_btreetid -{ - RelFileNode node; - ItemPointerData tid; /* changed tuple id */ -} xl_btreetid; - -/* - * This is what we need to know about delete - */ -typedef struct xl_btree_delete -{ - xl_btreetid target; /* deleted tuple id */ -} xl_btree_delete; - -#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData) - -/* - * This is what we need to know about pure (without split) insert - */ -typedef struct xl_btree_insert -{ - xl_btreetid target; /* inserted tuple id */ - /* BTITEM FOLLOWS AT END OF STRUCT */ -} xl_btree_insert; - -#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData) - -/* - * On insert with split we save items of both left and right siblings - * and restore content of both pages from log record - */ -typedef struct xl_btree_split -{ - xl_btreetid target; /* inserted tuple id */ - BlockIdData otherblk; /* second block participated in split: */ - /* first one is stored in target' tid */ - BlockIdData parentblk; /* parent block */ - BlockIdData leftblk; /* prev left block */ - BlockIdData rightblk; /* next right block */ - uint16 leftlen; /* len of left page items below */ - /* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */ -} xl_btree_split; - -#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) - -/* - * New root log record. 
- */ -typedef struct xl_btree_newroot -{ - RelFileNode node; - int32 level; - BlockIdData rootblk; - /* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */ -} xl_btree_newroot; - -#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData)) - -/* - * Operator strategy numbers -- ordering of these is <, <=, =, >=, > - */ - -#define BTLessStrategyNumber 1 -#define BTLessEqualStrategyNumber 2 -#define BTEqualStrategyNumber 3 -#define BTGreaterEqualStrategyNumber 4 -#define BTGreaterStrategyNumber 5 -#define BTMaxStrategyNumber 5 - -/* - * When a new operator class is declared, we require that the user - * supply us with an amproc procedure for determining whether, for - * two keys a and b, a < b, a = b, or a > b. This routine must - * return < 0, 0, > 0, respectively, in these three cases. Since we - * only have one such proc in amproc, it's number 1. - */ - -#define BTORDER_PROC 1 - /* * prototypes for functions in nbtree.c (external entry points for btree) */ @@ -340,27 +427,26 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS); extern Datum btrestrpos(PG_FUNCTION_ARGS); extern Datum btbulkdelete(PG_FUNCTION_ARGS); -extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); -extern void btree_undo(XLogRecPtr lsn, XLogRecord *record); -extern void btree_desc(char *buf, uint8 xl_info, char *rec); - /* * prototypes for functions in nbtinsert.c */ extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem, bool index_is_unique, Relation heapRel); +extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool is_root, bool is_only); /* * prototypes for functions in nbtpage.c */ extern void _bt_metapinit(Relation rel); extern Buffer _bt_getroot(Relation rel, int access); +extern Buffer _bt_gettrueroot(Relation rel); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_wrtbuf(Relation rel, Buffer buf); extern void _bt_wrtnorelbuf(Relation rel, 
Buffer buf); extern void _bt_pageinit(Page page, Size size); -extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level); +extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level); extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid); /* @@ -377,6 +463,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); +extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); /* * prototypes for functions in nbtstrat.c @@ -407,4 +494,13 @@ extern void _bt_spooldestroy(BTSpool *btspool); extern void _bt_spool(BTItem btitem, BTSpool *btspool); extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); +/* + * prototypes for functions in nbtxlog.c + */ +extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); +extern void btree_undo(XLogRecPtr lsn, XLogRecord *record); +extern void btree_desc(char *buf, uint8 xl_info, char *rec); +extern void btree_xlog_startup(void); +extern void btree_xlog_cleanup(void); + #endif /* NBTREE_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1659c65b93..a1be9bacf3 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: xlog.h,v 1.40 2002/11/15 02:44:57 momjian Exp $ + * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -145,10 +145,12 @@ typedef XLogPageHeaderData *XLogPageHeader; */ typedef struct RmgrData { - char *rm_name; + const char *rm_name; void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr); void (*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr); void (*rm_desc) (char *buf, uint8 xl_info, char *rec); + void 
(*rm_startup) (void); + void (*rm_cleanup) (void); } RmgrData; extern RmgrData RmgrTable[]; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 03e452121f..240889577a 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: catversion.h,v 1.177 2003/02/16 02:30:39 tgl Exp $ + * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200302151 +#define CATALOG_VERSION_NO 200302171 #endif