Make btree index structure adjustments and WAL logging changes needed to
support btree compaction, as per proposal of a few days ago. btree index pages no longer store parent links, instead they have a level indicator (counting up from zero for leaf pages). The FixBTree recovery logic is removed, and replaced by code that detects missing parent-level insertions during WAL replay. Also, generate appropriate WAL entries when updating btree metapage and when building a btree index from scratch. I believe btree indexes are now completely WAL-legal for the first time. initdb forced due to index and WAL changes.
This commit is contained in:
parent
4df0f1d26f
commit
70508ba7ae
|
@ -4,7 +4,7 @@
|
||||||
# Makefile for access/nbtree
|
# Makefile for access/nbtree
|
||||||
#
|
#
|
||||||
# IDENTIFICATION
|
# IDENTIFICATION
|
||||||
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $
|
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.12 2003/02/21 00:06:21 tgl Exp $
|
||||||
#
|
#
|
||||||
#-------------------------------------------------------------------------
|
#-------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ top_builddir = ../../../..
|
||||||
include $(top_builddir)/src/Makefile.global
|
include $(top_builddir)/src/Makefile.global
|
||||||
|
|
||||||
OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
|
OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \
|
||||||
nbtstrat.o nbtutils.o nbtsort.o
|
nbtstrat.o nbtutils.o nbtsort.o nbtxlog.o
|
||||||
|
|
||||||
all: SUBSYS.o
|
all: SUBSYS.o
|
||||||
|
|
||||||
|
|
|
@ -1,186 +1,378 @@
|
||||||
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $
|
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $
|
||||||
|
|
||||||
This directory contains a correct implementation of Lehman and Yao's
|
This directory contains a correct implementation of Lehman and Yao's
|
||||||
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
||||||
Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
|
Efficient Locking for Concurrent Operations on B-Trees, ACM Transactions
|
||||||
on Database Systems, Vol 6, No. 4, December 1981, pp 650-670).
|
on Database Systems, Vol 6, No. 4, December 1981, pp 650-670). We also
|
||||||
|
use a simplified version of the deletion logic described in Lanin and
|
||||||
|
Shasha (V. Lanin and D. Shasha, A Symmetric Concurrent B-Tree Algorithm,
|
||||||
|
Proceedings of 1986 Fall Joint Computer Conference, pp 380-389).
|
||||||
|
|
||||||
We have made the following changes in order to incorporate their algorithm
|
The Lehman and Yao algorithm and insertions
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
We have made the following changes in order to incorporate the L&Y algorithm
|
||||||
into Postgres:
|
into Postgres:
|
||||||
|
|
||||||
+ The requirement that all btree keys be unique is too onerous,
|
The requirement that all btree keys be unique is too onerous,
|
||||||
but the algorithm won't work correctly without it. Fortunately, it is
|
but the algorithm won't work correctly without it. Fortunately, it is
|
||||||
only necessary that keys be unique on a single tree level, because L&Y
|
only necessary that keys be unique on a single tree level, because L&Y
|
||||||
only use the assumption of key uniqueness when re-finding a key in a
|
only use the assumption of key uniqueness when re-finding a key in a
|
||||||
parent node (to determine where to insert the key for a split page).
|
parent page (to determine where to insert the key for a split page).
|
||||||
Therefore, we can use the link field to disambiguate multiple
|
Therefore, we can use the link field to disambiguate multiple
|
||||||
occurrences of the same user key: only one entry in the parent level
|
occurrences of the same user key: only one entry in the parent level
|
||||||
will be pointing at the page we had split. (Indeed we need not look at
|
will be pointing at the page we had split. (Indeed we need not look at
|
||||||
the real "key" at all, just at the link field.) We can distinguish
|
the real "key" at all, just at the link field.) We can distinguish
|
||||||
items at the leaf level in the same way, by examining their links to
|
items at the leaf level in the same way, by examining their links to
|
||||||
heap tuples; we'd never have two items for the same heap tuple.
|
heap tuples; we'd never have two items for the same heap tuple.
|
||||||
|
|
||||||
+ Lehman and Yao assume that the key range for a subtree S is described
|
Lehman and Yao assume that the key range for a subtree S is described
|
||||||
by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
|
by Ki < v <= Ki+1 where Ki and Ki+1 are the adjacent keys in the parent
|
||||||
node. This does not work for nonunique keys (for example, if we have
|
page. This does not work for nonunique keys (for example, if we have
|
||||||
enough equal keys to spread across several leaf pages, there *must* be
|
enough equal keys to spread across several leaf pages, there *must* be
|
||||||
some equal bounding keys in the first level up). Therefore we assume
|
some equal bounding keys in the first level up). Therefore we assume
|
||||||
Ki <= v <= Ki+1 instead. A search that finds exact equality to a
|
Ki <= v <= Ki+1 instead. A search that finds exact equality to a
|
||||||
bounding key in an upper tree level must descend to the left of that
|
bounding key in an upper tree level must descend to the left of that
|
||||||
key to ensure it finds any equal keys in the preceding page. An
|
key to ensure it finds any equal keys in the preceding page. An
|
||||||
insertion that sees the high key of its target page is equal to the key
|
insertion that sees the high key of its target page is equal to the key
|
||||||
to be inserted has a choice whether or not to move right, since the new
|
to be inserted has a choice whether or not to move right, since the new
|
||||||
key could go on either page. (Currently, we try to find a page where
|
key could go on either page. (Currently, we try to find a page where
|
||||||
there is room for the new key without a split.)
|
there is room for the new key without a split.)
|
||||||
|
|
||||||
+ Lehman and Yao don't require read locks, but assume that in-memory
|
Lehman and Yao don't require read locks, but assume that in-memory
|
||||||
copies of tree nodes are unshared. Postgres shares in-memory buffers
|
copies of tree pages are unshared. Postgres shares in-memory buffers
|
||||||
among backends. As a result, we do page-level read locking on btree
|
among backends. As a result, we do page-level read locking on btree
|
||||||
nodes in order to guarantee that no record is modified while we are
|
pages in order to guarantee that no record is modified while we are
|
||||||
examining it. This reduces concurrency but guarantees correct
|
examining it. This reduces concurrency but guarantees correct
|
||||||
behavior. An advantage is that when trading in a read lock for a
|
behavior. An advantage is that when trading in a read lock for a
|
||||||
write lock, we need not re-read the page after getting the write lock.
|
write lock, we need not re-read the page after getting the write lock.
|
||||||
Since we're also holding a pin on the shared buffer containing the
|
Since we're also holding a pin on the shared buffer containing the
|
||||||
page, we know that buffer still contains the page and is up-to-date.
|
page, we know that buffer still contains the page and is up-to-date.
|
||||||
|
|
||||||
+ We support the notion of an ordered "scan" of an index as well as
|
We support the notion of an ordered "scan" of an index as well as
|
||||||
insertions, deletions, and simple lookups. A scan in the forward
|
insertions, deletions, and simple lookups. A scan in the forward
|
||||||
direction is no problem, we just use the right-sibling pointers that
|
direction is no problem, we just use the right-sibling pointers that
|
||||||
L&Y require anyway. (Thus, once we have descended the tree to the
|
L&Y require anyway. (Thus, once we have descended the tree to the
|
||||||
correct start point for the scan, the scan looks only at leaf pages
|
correct start point for the scan, the scan looks only at leaf pages
|
||||||
and never at higher tree levels.) To support scans in the backward
|
and never at higher tree levels.) To support scans in the backward
|
||||||
direction, we also store a "left sibling" link much like the "right
|
direction, we also store a "left sibling" link much like the "right
|
||||||
sibling". (This adds an extra step to the L&Y split algorithm: while
|
sibling". (This adds an extra step to the L&Y split algorithm: while
|
||||||
holding the write lock on the page being split, we also lock its former
|
holding the write lock on the page being split, we also lock its former
|
||||||
right sibling to update that page's left-link. This is safe since no
|
right sibling to update that page's left-link. This is safe since no
|
||||||
writer of that page can be interested in acquiring a write lock on our
|
writer of that page can be interested in acquiring a write lock on our
|
||||||
page.) A backwards scan has one additional bit of complexity: after
|
page.) A backwards scan has one additional bit of complexity: after
|
||||||
following the left-link we must account for the possibility that the
|
following the left-link we must account for the possibility that the
|
||||||
left sibling page got split before we could read it. So, we have to
|
left sibling page got split before we could read it. So, we have to
|
||||||
move right until we find a page whose right-link matches the page we
|
move right until we find a page whose right-link matches the page we
|
||||||
came from.
|
came from. (Actually, it's even harder than that; see deletion discussion
|
||||||
|
below.)
|
||||||
|
|
||||||
+ Read locks on a page are held for as long as a scan is examining a page.
|
Read locks on a page are held for as long as a scan is examining a page.
|
||||||
But nbtree.c arranges to drop the read lock, but not the buffer pin,
|
But nbtree.c arranges to drop the read lock, but not the buffer pin,
|
||||||
on the current page of a scan before control leaves nbtree. When we
|
on the current page of a scan before control leaves nbtree. When we
|
||||||
come back to resume the scan, we have to re-grab the read lock and
|
come back to resume the scan, we have to re-grab the read lock and
|
||||||
then move right if the current item moved (see _bt_restscan()). Keeping
|
then move right if the current item moved (see _bt_restscan()). Keeping
|
||||||
the pin ensures that the current item cannot move left or be deleted
|
the pin ensures that the current item cannot move left or be deleted
|
||||||
(see btbulkdelete).
|
(see btbulkdelete).
|
||||||
|
|
||||||
+ In most cases we release our lock and pin on a page before attempting
|
In most cases we release our lock and pin on a page before attempting
|
||||||
to acquire pin and lock on the page we are moving to. In a few places
|
to acquire pin and lock on the page we are moving to. In a few places
|
||||||
it is necessary to lock the next page before releasing the current one.
|
it is necessary to lock the next page before releasing the current one.
|
||||||
This is safe when moving right or up, but not when moving left or down
|
This is safe when moving right or up, but not when moving left or down
|
||||||
(else we'd create the possibility of deadlocks).
|
(else we'd create the possibility of deadlocks).
|
||||||
|
|
||||||
+ Lehman and Yao fail to discuss what must happen when the root page
|
Lehman and Yao fail to discuss what must happen when the root page
|
||||||
becomes full and must be split. Our implementation is to split the
|
becomes full and must be split. Our implementation is to split the
|
||||||
root in the same way that any other page would be split, then construct
|
root in the same way that any other page would be split, then construct
|
||||||
a new root page holding pointers to both of the resulting pages (which
|
a new root page holding pointers to both of the resulting pages (which
|
||||||
now become siblings on level 2 of the tree). The new root page is then
|
now become siblings on the next level of the tree). The new root page
|
||||||
installed by altering the root pointer in the meta-data page (see
|
is then installed by altering the root pointer in the meta-data page (see
|
||||||
below). This works because the root is not treated specially in any
|
below). This works because the root is not treated specially in any
|
||||||
other way --- in particular, searches will move right using its link
|
other way --- in particular, searches will move right using its link
|
||||||
pointer if the link is set. Therefore, searches will find the data
|
pointer if the link is set. Therefore, searches will find the data
|
||||||
that's been moved into the right sibling even if they read the metadata
|
that's been moved into the right sibling even if they read the meta-data
|
||||||
page before it got updated. This is the same reasoning that makes a
|
page before it got updated. This is the same reasoning that makes a
|
||||||
split of a non-root page safe. The locking considerations are similar too.
|
split of a non-root page safe. The locking considerations are similar too.
|
||||||
|
|
||||||
+ Lehman and Yao assume fixed-size keys, but we must deal with
|
When an inserter recurses up the tree, splitting internal pages to insert
|
||||||
variable-size keys. Therefore there is not a fixed maximum number of
|
links to pages inserted on the level below, it is possible that it will
|
||||||
keys per page; we just stuff in as many as will fit. When we split a
|
need to access a page above the level that was the root when it began its
|
||||||
page, we try to equalize the number of bytes, not items, assigned to
|
descent (or more accurately, the level that was the root when it read the
|
||||||
each of the resulting pages. Note we must include the incoming item in
|
meta-data page). In this case the stack it made while descending does not
|
||||||
this calculation, otherwise it is possible to find that the incoming
|
help for finding the correct page. When this happens, we find the correct
|
||||||
item doesn't fit on the split page where it needs to go!
|
place by re-descending the tree until we reach the level one above the
|
||||||
|
level we need to insert a link to, and then moving right as necessary.
|
||||||
|
(Typically this will take only two fetches, the meta-data page and the new
|
||||||
|
root, but in principle there could have been more than one root split
|
||||||
|
since we saw the root. We can identify the correct tree level by means of
|
||||||
|
the level numbers stored in each page. The situation is rare enough that
|
||||||
|
we do not need a more efficient solution.)
|
||||||
|
|
||||||
In addition, the following things are handy to know:
|
Lehman and Yao assume fixed-size keys, but we must deal with
|
||||||
|
variable-size keys. Therefore there is not a fixed maximum number of
|
||||||
|
keys per page; we just stuff in as many as will fit. When we split a
|
||||||
|
page, we try to equalize the number of bytes, not items, assigned to
|
||||||
|
each of the resulting pages. Note we must include the incoming item in
|
||||||
|
this calculation, otherwise it is possible to find that the incoming
|
||||||
|
item doesn't fit on the split page where it needs to go!
|
||||||
|
|
||||||
+ Page zero of every btree is a meta-data page. This page stores
|
The deletion algorithm
|
||||||
the location of the root page, a pointer to a list of free
|
----------------------
|
||||||
pages, and other stuff that's handy to know. (Currently, we
|
|
||||||
never shrink btree indexes so there are never any free pages.)
|
|
||||||
|
|
||||||
+ The algorithm assumes we can fit at least three items per page
|
Deletions of leaf items are handled by getting a super-exclusive lock on
|
||||||
(a "high key" and two real data items). Therefore it's unsafe
|
the target page, so that no other backend has a pin on the page when the
|
||||||
to accept items larger than 1/3rd page size. Larger items would
|
deletion starts. This means no scan is pointing at the page, so no other
|
||||||
work sometimes, but could cause failures later on depending on
|
backend can lose its place due to the item deletion.
|
||||||
what else gets put on their page.
|
|
||||||
|
|
||||||
+ This algorithm doesn't guarantee btree consistency after a kernel crash
|
The above does not work for deletion of items in internal pages, since
|
||||||
or hardware failure. To do that, we'd need ordered writes, and UNIX
|
other backends keep no lock nor pin on a page they have descended past.
|
||||||
doesn't support ordered writes (short of fsync'ing every update, which
|
Instead, when a backend is ascending the tree using its stack, it must
|
||||||
is too high a price). Rebuilding corrupted indexes during restart
|
be prepared for the possibility that the item it wants is to the left of
|
||||||
seems more attractive.
|
the recorded position (but it can't have moved left out of the recorded
|
||||||
|
page). Since we hold a lock on the lower page (per L&Y) until we have
|
||||||
|
re-found the parent item that links to it, we can be assured that the
|
||||||
|
parent item does still exist and can't have been deleted. Also, because
|
||||||
|
we are matching downlink page numbers and not data keys, we don't have any
|
||||||
|
problem with possibly misidentifying the parent item.
|
||||||
|
|
||||||
+ Deletions are handled by getting a super-exclusive lock on the target
|
We consider deleting an entire page from the btree only when it's become
|
||||||
page, so that no other backend has a pin on the page when the deletion
|
completely empty of items. (Merging partly-full pages would allow better
|
||||||
starts. This means no scan is pointing at the page. This is OK for
|
space reuse, but it seems impractical to move existing data items left or
|
||||||
deleting leaf items, probably not OK for deleting internal nodes;
|
right to make this happen --- a scan moving in the opposite direction
|
||||||
will need to think harder when it's time to support index compaction.
|
might miss the items if so. We could do it during VACUUM FULL, though.)
|
||||||
|
Also, we *never* delete the rightmost page on a tree level (this
|
||||||
|
restriction simplifies the traversal algorithms, as explained below).
|
||||||
|
|
||||||
+ "ScanKey" data structures are used in two fundamentally different ways
|
To delete an empty page, we acquire write lock on its left sibling (if
|
||||||
in this code. Searches for the initial position for a scan, as well as
|
any), the target page itself, the right sibling (there must be one), and
|
||||||
insertions, use scankeys in which the comparison function is a 3-way
|
the parent page, in that order. The parent page must be found using the
|
||||||
comparator (<0, =0, >0 result). These scankeys are built within the
|
same type of search as used to find the parent during an insertion split.
|
||||||
btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we
|
Then we update the side-links in the siblings, mark the target page
|
||||||
are positioned, sequential examination of tuples in a scan is done by
|
deleted, and remove the downlink from the parent, as well as the parent's
|
||||||
_bt_checkkeys() using scankeys in which the comparison functions return
|
upper bounding key for the target (the one separating it from its right
|
||||||
booleans --- for example, int4lt might be used. These scankeys are the
|
sibling). This causes the target page's key space to effectively belong
|
||||||
ones originally passed in from outside the btree code. Same
|
to its right sibling. (Neither the left nor right sibling pages need to
|
||||||
representation, but different comparison functions!
|
change their "high key" if any; so there is no problem with possibly not
|
||||||
|
having enough space to replace a high key.) The side-links in the target
|
||||||
|
page are not changed.
|
||||||
|
|
||||||
Notes about data representation:
|
(Note: Lanin and Shasha prefer to make the key space move left, but their
|
||||||
|
argument for doing so hinges on not having left-links, which we have
|
||||||
|
anyway. So we simplify the algorithm by moving key space right.)
|
||||||
|
|
||||||
+ The right-sibling link required by L&Y is kept in the page "opaque
|
To preserve consistency on the parent level, we cannot merge the key space
|
||||||
data" area, as is the left-sibling link and some flags.
|
of a page into its right sibling unless the right sibling is a child of
|
||||||
|
the same parent --- otherwise, the parent's key space assignment changes
|
||||||
|
too, meaning we'd have to make bounding-key updates in its parent, and
|
||||||
|
perhaps all the way up the tree. Since we can't possibly do that
|
||||||
|
atomically, we forbid this case. That means that the rightmost child of a
|
||||||
|
parent node can't be deleted unless it's the only remaining child.
|
||||||
|
|
||||||
+ We also keep a parent link in the opaque data, but this link is not
|
When we delete the last remaining child of a parent page, we mark the
|
||||||
very trustworthy because it is not updated when the parent page splits.
|
parent page "half-dead" as part of the atomic update that deletes the
|
||||||
Thus, it points to some page on the parent level, but possibly a page
|
child page. This implicitly transfers the parent's key space to its right
|
||||||
well to the left of the page's actual current parent. In most cases
|
sibling (which it must have, since we never delete the overall-rightmost
|
||||||
we do not need this link at all. Normally we return to a parent page
|
page of a level). No future insertions into the parent level are allowed
|
||||||
using a stack of entries that are made as we descend the tree, as in L&Y.
|
to insert keys into the half-dead page --- they must move right to its
|
||||||
There is exactly one case where the stack will not help: concurrent
|
sibling, instead. The parent remains empty and can be deleted in a
|
||||||
root splits. If an inserter process needs to split what had been the
|
separate atomic action. (However, if it's the rightmost child of its own
|
||||||
root when it started its descent, but finds that that page is no longer
|
parent, it might have to stay half-dead for awhile, until it's also the
|
||||||
the root (because someone else split it meanwhile), then it uses the
|
only child.)
|
||||||
parent link to move up to the next level. This is OK because we do fix
|
|
||||||
the parent link in a former root page when splitting it. This logic
|
|
||||||
will work even if the root is split multiple times (even up to creation
|
|
||||||
of multiple new levels) before an inserter returns to it. The same
|
|
||||||
could not be said of finding the new root via the metapage, since that
|
|
||||||
would work only for a single level of added root.
|
|
||||||
|
|
||||||
+ The Postgres disk block data format (an array of items) doesn't fit
|
Note that an empty leaf page is a valid tree state, but an empty interior
|
||||||
Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
|
page is not legal (an interior page must have children to delegate its
|
||||||
so we have to play some games.
|
key space to). So an interior page *must* be marked half-dead as soon
|
||||||
|
as its last child is deleted.
|
||||||
|
|
||||||
+ On a page that is not rightmost in its tree level, the "high key" is
|
The notion of a half-dead page means that the key space relationship between
|
||||||
kept in the page's first item, and real data items start at item 2.
|
the half-dead page's level and its parent's level may be a little out of
|
||||||
The link portion of the "high key" item goes unused. A page that is
|
whack: key space that appears to belong to the half-dead page's parent on the
|
||||||
rightmost has no "high key", so data items start with the first item.
|
parent level may really belong to its right sibling. We can tolerate this,
|
||||||
Putting the high key at the left, rather than the right, may seem odd,
|
however, because insertions and deletions on upper tree levels are always
|
||||||
but it avoids moving the high key as we add data items.
|
done by reference to child page numbers, not keys. The only cost is that
|
||||||
|
searches may sometimes descend to the half-dead page and then have to move
|
||||||
|
right, rather than going directly to the sibling page.
|
||||||
|
|
||||||
+ On a leaf page, the data items are simply links to (TIDs of) tuples
|
A deleted page cannot be reclaimed immediately, since there may be other
|
||||||
in the relation being indexed, with the associated key values.
|
processes waiting to reference it (ie, search processes that just left the
|
||||||
|
parent, or scans moving right or left from one of the siblings). These
|
||||||
|
processes must observe that the page is marked dead and recover
|
||||||
|
accordingly. Searches and forward scans simply follow the right-link
|
||||||
|
until they find a non-dead page --- this will be where the deleted page's
|
||||||
|
key-space moved to.
|
||||||
|
|
||||||
+ On a non-leaf page, the data items are down-links to child pages with
|
Stepping left in a backward scan is complicated because we must consider
|
||||||
bounding keys. The key in each data item is the *lower* bound for
|
the possibility that the left sibling was just split (meaning we must find
|
||||||
keys on that child page, so logically the key is to the left of that
|
the rightmost page derived from the left sibling), plus the possibility
|
||||||
downlink. The high key (if present) is the upper bound for the last
|
that the page we were just on has now been deleted and hence isn't in the
|
||||||
downlink. The first data item on each such page has no lower bound
|
sibling chain at all anymore. So the move-left algorithm becomes:
|
||||||
--- or lower bound of minus infinity, if you prefer. The comparison
|
0. Remember the page we are on as the "original page".
|
||||||
routines must treat it accordingly. The actual key stored in the
|
1. Follow the original page's left-link (we're done if this is zero).
|
||||||
item is irrelevant, and need not be stored at all. This arrangement
|
2. If the current page is live and its right-link matches the "original
|
||||||
corresponds to the fact that an L&Y non-leaf page has one more pointer
|
page", we are done.
|
||||||
than key.
|
3. Otherwise, move right one or more times looking for a live page whose
|
||||||
|
right-link matches the "original page". If found, we are done. (In
|
||||||
|
principle we could scan all the way to the right end of the index, but
|
||||||
|
in practice it seems better to give up after a small number of tries.
|
||||||
|
It's unlikely the original page's sibling split more than a few times
|
||||||
|
while we were in flight to it; if we do not find a matching link in a
|
||||||
|
few tries, then most likely the original page is deleted.)
|
||||||
|
4. Return to the "original page". If it is still live, return to step 1
|
||||||
|
(we guessed wrong about it being deleted, and should restart with its
|
||||||
|
current left-link). If it is dead, move right until a non-dead page
|
||||||
|
is found (there must be one, since rightmost pages are never deleted),
|
||||||
|
mark that as the new "original page", and return to step 1.
|
||||||
|
This algorithm is correct because the live page found by step 4 will have
|
||||||
|
the same left keyspace boundary as the page we started from. Therefore,
|
||||||
|
when we ultimately exit, it must be on a page whose right keyspace
|
||||||
|
boundary matches the left boundary of where we started --- which is what
|
||||||
|
we need to be sure we don't miss or re-scan any items.
|
||||||
|
|
||||||
Notes to operator class implementors:
|
A deleted page can only be reclaimed once there is no scan or search that
|
||||||
|
has a reference to it; until then, it must stay in place with its
|
||||||
|
right-link undisturbed. We implement this by waiting until all
|
||||||
|
transactions that were running at the time of deletion are dead; which is
|
||||||
|
overly strong, but is simple to implement within Postgres. When marked
|
||||||
|
dead, a deleted page is labeled with the next-transaction counter value.
|
||||||
|
VACUUM can reclaim the page for re-use when this transaction number is
|
||||||
|
older than the oldest open transaction. (NOTE: VACUUM FULL can reclaim
|
||||||
|
such pages immediately.)
|
||||||
|
|
||||||
+ With this implementation, we require each supported datatype to supply
|
Reclaiming a page doesn't actually change its state on disk --- we simply
|
||||||
us with a comparison procedure via pg_amproc. This procedure must take
|
record it in the shared-memory free space map, from which it will be
|
||||||
two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
|
handed out the next time a new page is needed for a page split. The
|
||||||
A = B, or A > B, respectively. See nbtcompare.c for examples.
|
deleted page's contents will be overwritten by the split operation.
|
||||||
|
(Note: if we find a deleted page with an extremely old transaction
|
||||||
|
number, it'd be worthwhile to re-mark it with FrozenTransactionId so that
|
||||||
|
a later xid wraparound can't cause us to think the page is unreclaimable.
|
||||||
|
But in more normal situations this would be a waste of a disk write.)
|
||||||
|
|
||||||
|
Because we never delete the rightmost page of any level (and in particular
|
||||||
|
never delete the root), it's impossible for the height of the tree to
|
||||||
|
decrease. After massive deletions we might have a scenario in which the
|
||||||
|
tree is "skinny", with several single-page levels below the root.
|
||||||
|
Operations will still be correct in this case, but we'd waste cycles
|
||||||
|
descending through the single-page levels. To handle this we use an idea
|
||||||
|
from Lanin and Shasha: we keep track of the "fast root" level, which is
|
||||||
|
the lowest single-page level. The meta-data page keeps a pointer to this
|
||||||
|
level as well as the true root. All ordinary operations initiate their
|
||||||
|
searches at the fast root not the true root. When we split a page that is
|
||||||
|
alone on its level or delete the next-to-last page on a level (both cases
|
||||||
|
are easily detected), we have to make sure that the fast root pointer is
|
||||||
|
adjusted appropriately. In the split case, we do this work as part of the
|
||||||
|
atomic update for the insertion into the parent level; in the delete case
|
||||||
|
as part of the atomic update for the delete (either way, the metapage has
|
||||||
|
to be the last page locked in the update to avoid deadlock risks). This
|
||||||
|
avoids race conditions if two such operations are executing concurrently.
|
||||||
|
|
||||||
|
VACUUM needs to do a linear scan of an index to search for empty leaf
|
||||||
|
pages and half-dead parent pages that can be deleted, as well as deleted
|
||||||
|
pages that can be reclaimed because they are older than all open
|
||||||
|
transactions.
|
||||||
|
|
||||||
|
WAL considerations
|
||||||
|
------------------
|
||||||
|
|
||||||
|
The insertion and deletion algorithms in themselves don't guarantee btree
|
||||||
|
consistency after a crash. To provide robustness, we depend on WAL
|
||||||
|
replay. A single WAL entry is effectively an atomic action --- we can
|
||||||
|
redo it from the log if it fails to complete.
|
||||||
|
|
||||||
|
Ordinary item insertions (that don't force a page split) are of course
|
||||||
|
single WAL entries, since they only affect one page. The same for
|
||||||
|
leaf-item deletions (if the deletion brings the leaf page to zero items,
|
||||||
|
it is now a candidate to be deleted, but that is a separate action).
|
||||||
|
|
||||||
|
An insertion that causes a page split is logged as a single WAL entry for
|
||||||
|
the changes occurring on the insertion's level --- including update of the
|
||||||
|
right sibling's left-link --- followed by a second WAL entry for the
|
||||||
|
insertion on the parent level (which might itself be a page split, requiring
|
||||||
|
an additional insertion above that, etc).
|
||||||
|
|
||||||
|
For a root split, the follow-on WAL entry is a "new root" entry rather than
|
||||||
|
an "insertion" entry, but details are otherwise much the same.
|
||||||
|
|
||||||
|
Because insertion involves multiple atomic actions, the WAL replay logic
|
||||||
|
has to detect the case where a page split isn't followed by a matching
|
||||||
|
insertion on the parent level, and then do that insertion on its own (and
|
||||||
|
recursively for any subsequent parent insertion, of course). This is
|
||||||
|
feasible because the WAL entry for the split contains enough info to know
|
||||||
|
what must be inserted in the parent level.
|
||||||
|
|
||||||
|
When splitting a non-root page that is alone on its level, the required
|
||||||
|
metapage update (of the "fast root" link) is performed and logged as part
|
||||||
|
of the insertion into the parent level. When splitting the root page, the
|
||||||
|
metapage update is handled as part of the "new root" action.
|
||||||
|
|
||||||
|
A page deletion is logged as a single WAL entry covering all four
|
||||||
|
required page updates (target page, left and right siblings, and parent)
|
||||||
|
as an atomic event. (Any required fast-root link update is also part
|
||||||
|
of the WAL entry.) If the parent page becomes half-dead but is not
|
||||||
|
immediately deleted due to a subsequent crash, there is no loss of
|
||||||
|
consistency, and the empty page will be picked up by the next VACUUM.
|
||||||
|
|
||||||
|
Other things that are handy to know
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
Page zero of every btree is a meta-data page. This page stores the
|
||||||
|
location of the root page --- both the true root and the current effective
|
||||||
|
root ("fast" root).
|
||||||
|
|
||||||
|
The algorithm assumes we can fit at least three items per page
|
||||||
|
(a "high key" and two real data items). Therefore it's unsafe
|
||||||
|
to accept items larger than 1/3rd page size. Larger items would
|
||||||
|
work sometimes, but could cause failures later on depending on
|
||||||
|
what else gets put on their page.
|
||||||
|
|
||||||
|
"ScanKey" data structures are used in two fundamentally different ways
|
||||||
|
in this code. Searches for the initial position for a scan, as well as
|
||||||
|
insertions, use scankeys in which the comparison function is a 3-way
|
||||||
|
comparator (<0, =0, >0 result). These scankeys are built within the
|
||||||
|
btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we
|
||||||
|
are positioned, sequential examination of tuples in a scan is done by
|
||||||
|
_bt_checkkeys() using scankeys in which the comparison functions return
|
||||||
|
booleans --- for example, int4lt might be used. These scankeys are the
|
||||||
|
ones originally passed in from outside the btree code. Same
|
||||||
|
representation, but different comparison functions!
|
||||||
|
|
||||||
|
Notes about data representation
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
The right-sibling link required by L&Y is kept in the page "opaque
|
||||||
|
data" area, as is the left-sibling link, the page level, and some flags.
|
||||||
|
The page level counts upwards from zero at the leaf level, to the tree
|
||||||
|
depth minus 1 at the root. (Counting up from the leaves ensures that we
|
||||||
|
don't need to renumber any existing pages when splitting the root.)
|
||||||
|
|
||||||
|
The Postgres disk block data format (an array of items) doesn't fit
|
||||||
|
Lehman and Yao's alternating-keys-and-pointers notion of a disk page,
|
||||||
|
so we have to play some games.
|
||||||
|
|
||||||
|
On a page that is not rightmost in its tree level, the "high key" is
|
||||||
|
kept in the page's first item, and real data items start at item 2.
|
||||||
|
The link portion of the "high key" item goes unused. A page that is
|
||||||
|
rightmost has no "high key", so data items start with the first item.
|
||||||
|
Putting the high key at the left, rather than the right, may seem odd,
|
||||||
|
but it avoids moving the high key as we add data items.
|
||||||
|
|
||||||
|
On a leaf page, the data items are simply links to (TIDs of) tuples
|
||||||
|
in the relation being indexed, with the associated key values.
|
||||||
|
|
||||||
|
On a non-leaf page, the data items are down-links to child pages with
|
||||||
|
bounding keys. The key in each data item is the *lower* bound for
|
||||||
|
keys on that child page, so logically the key is to the left of that
|
||||||
|
downlink. The high key (if present) is the upper bound for the last
|
||||||
|
downlink. The first data item on each such page has no lower bound
|
||||||
|
--- or lower bound of minus infinity, if you prefer. The comparison
|
||||||
|
routines must treat it accordingly. The actual key stored in the
|
||||||
|
item is irrelevant, and need not be stored at all. This arrangement
|
||||||
|
corresponds to the fact that an L&Y non-leaf page has one more pointer
|
||||||
|
than key.
|
||||||
|
|
||||||
|
Notes to operator class implementors
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
With this implementation, we require each supported datatype to supply
|
||||||
|
us with a comparison procedure via pg_amproc. This procedure must take
|
||||||
|
two nonnull values A and B and return an int32 < 0, 0, or > 0 if A < B,
|
||||||
|
A = B, or A > B, respectively. See nbtcompare.c for examples.
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -9,7 +9,7 @@
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.58 2002/08/06 02:36:33 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||||
|
@ -47,15 +47,16 @@ extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
|
||||||
|
|
||||||
#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
|
#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_metapinit() -- Initialize the metadata page of a btree.
|
* _bt_metapinit() -- Initialize the metadata page of a new btree.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
_bt_metapinit(Relation rel)
|
_bt_metapinit(Relation rel)
|
||||||
{
|
{
|
||||||
Buffer buf;
|
Buffer buf;
|
||||||
Page pg;
|
Page pg;
|
||||||
BTMetaPageData metad;
|
BTMetaPageData *metad;
|
||||||
BTPageOpaque op;
|
BTPageOpaque op;
|
||||||
|
|
||||||
/* can't be sharing this with anyone, now... */
|
/* can't be sharing this with anyone, now... */
|
||||||
|
@ -67,18 +68,51 @@ _bt_metapinit(Relation rel)
|
||||||
RelationGetRelationName(rel));
|
RelationGetRelationName(rel));
|
||||||
|
|
||||||
buf = ReadBuffer(rel, P_NEW);
|
buf = ReadBuffer(rel, P_NEW);
|
||||||
|
Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
|
||||||
pg = BufferGetPage(buf);
|
pg = BufferGetPage(buf);
|
||||||
|
|
||||||
|
/* NO ELOG(ERROR) from here till newmeta op is logged */
|
||||||
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
_bt_pageinit(pg, BufferGetPageSize(buf));
|
_bt_pageinit(pg, BufferGetPageSize(buf));
|
||||||
|
|
||||||
metad.btm_magic = BTREE_MAGIC;
|
metad = BTPageGetMeta(pg);
|
||||||
metad.btm_version = BTREE_VERSION;
|
metad->btm_magic = BTREE_MAGIC;
|
||||||
metad.btm_root = P_NONE;
|
metad->btm_version = BTREE_VERSION;
|
||||||
metad.btm_level = 0;
|
metad->btm_root = P_NONE;
|
||||||
memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
|
metad->btm_level = 0;
|
||||||
|
metad->btm_fastroot = P_NONE;
|
||||||
|
metad->btm_fastlevel = 0;
|
||||||
|
|
||||||
op = (BTPageOpaque) PageGetSpecialPointer(pg);
|
op = (BTPageOpaque) PageGetSpecialPointer(pg);
|
||||||
op->btpo_flags = BTP_META;
|
op->btpo_flags = BTP_META;
|
||||||
|
|
||||||
|
/* XLOG stuff */
|
||||||
|
if (!rel->rd_istemp)
|
||||||
|
{
|
||||||
|
xl_btree_newmeta xlrec;
|
||||||
|
XLogRecPtr recptr;
|
||||||
|
XLogRecData rdata[1];
|
||||||
|
|
||||||
|
xlrec.node = rel->rd_node;
|
||||||
|
xlrec.meta.root = metad->btm_root;
|
||||||
|
xlrec.meta.level = metad->btm_level;
|
||||||
|
xlrec.meta.fastroot = metad->btm_fastroot;
|
||||||
|
xlrec.meta.fastlevel = metad->btm_fastlevel;
|
||||||
|
|
||||||
|
rdata[0].buffer = InvalidBuffer;
|
||||||
|
rdata[0].data = (char *) &xlrec;
|
||||||
|
rdata[0].len = SizeOfBtreeNewmeta;
|
||||||
|
rdata[0].next = NULL;
|
||||||
|
|
||||||
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
|
||||||
|
|
||||||
|
PageSetLSN(pg, recptr);
|
||||||
|
PageSetSUI(pg, ThisStartUpID);
|
||||||
|
}
|
||||||
|
|
||||||
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
WriteBuffer(buf);
|
WriteBuffer(buf);
|
||||||
|
|
||||||
/* all done */
|
/* all done */
|
||||||
|
@ -102,6 +136,14 @@ _bt_metapinit(Relation rel)
|
||||||
* NOTE that the returned root page will have only a read lock set
|
* NOTE that the returned root page will have only a read lock set
|
||||||
* on it even if access = BT_WRITE!
|
* on it even if access = BT_WRITE!
|
||||||
*
|
*
|
||||||
|
* The returned page is not necessarily the true root --- it could be
|
||||||
|
* a "fast root" (a page that is alone in its level due to deletions).
|
||||||
|
* Also, if the root page is split while we are "in flight" to it,
|
||||||
|
* what we will return is the old root, which is now just the leftmost
|
||||||
|
* page on a probably-not-very-wide level. For most purposes this is
|
||||||
|
* as good as or better than the true root, so we do not bother to
|
||||||
|
* insist on finding the true root.
|
||||||
|
*
|
||||||
* On successful return, the root page is pinned and read-locked.
|
* On successful return, the root page is pinned and read-locked.
|
||||||
* The metadata page is not locked or pinned on exit.
|
* The metadata page is not locked or pinned on exit.
|
||||||
*/
|
*/
|
||||||
|
@ -162,15 +204,19 @@ _bt_getroot(Relation rel, int access)
|
||||||
rootblkno = BufferGetBlockNumber(rootbuf);
|
rootblkno = BufferGetBlockNumber(rootbuf);
|
||||||
rootpage = BufferGetPage(rootbuf);
|
rootpage = BufferGetPage(rootbuf);
|
||||||
|
|
||||||
|
_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
|
||||||
|
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
||||||
|
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
|
||||||
|
rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
|
||||||
|
rootopaque->btpo.level = 0;
|
||||||
|
|
||||||
/* NO ELOG(ERROR) till meta is updated */
|
/* NO ELOG(ERROR) till meta is updated */
|
||||||
START_CRIT_SECTION();
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
metad->btm_root = rootblkno;
|
metad->btm_root = rootblkno;
|
||||||
metad->btm_level = 1;
|
metad->btm_level = 0;
|
||||||
|
metad->btm_fastroot = rootblkno;
|
||||||
_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
|
metad->btm_fastlevel = 0;
|
||||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
|
||||||
rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
|
|
||||||
|
|
||||||
/* XLOG stuff */
|
/* XLOG stuff */
|
||||||
if (!rel->rd_istemp)
|
if (!rel->rd_istemp)
|
||||||
|
@ -180,16 +226,15 @@ _bt_getroot(Relation rel, int access)
|
||||||
XLogRecData rdata;
|
XLogRecData rdata;
|
||||||
|
|
||||||
xlrec.node = rel->rd_node;
|
xlrec.node = rel->rd_node;
|
||||||
xlrec.level = 1;
|
xlrec.rootblk = rootblkno;
|
||||||
BlockIdSet(&(xlrec.rootblk), rootblkno);
|
xlrec.level = 0;
|
||||||
|
|
||||||
rdata.buffer = InvalidBuffer;
|
rdata.buffer = InvalidBuffer;
|
||||||
rdata.data = (char *) &xlrec;
|
rdata.data = (char *) &xlrec;
|
||||||
rdata.len = SizeOfBtreeNewroot;
|
rdata.len = SizeOfBtreeNewroot;
|
||||||
rdata.next = NULL;
|
rdata.next = NULL;
|
||||||
|
|
||||||
recptr = XLogInsert(RM_BTREE_ID,
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
|
||||||
XLOG_BTREE_NEWROOT | XLOG_BTREE_LEAF,
|
|
||||||
&rdata);
|
|
||||||
|
|
||||||
PageSetLSN(rootpage, recptr);
|
PageSetLSN(rootpage, recptr);
|
||||||
PageSetSUI(rootpage, ThisStartUpID);
|
PageSetSUI(rootpage, ThisStartUpID);
|
||||||
|
@ -201,7 +246,11 @@ _bt_getroot(Relation rel, int access)
|
||||||
|
|
||||||
_bt_wrtnorelbuf(rel, rootbuf);
|
_bt_wrtnorelbuf(rel, rootbuf);
|
||||||
|
|
||||||
/* swap write lock for read lock */
|
/*
|
||||||
|
* swap root write lock for read lock. There is no danger of
|
||||||
|
* anyone else accessing the new root page while it's unlocked,
|
||||||
|
* since no one else knows where it is yet.
|
||||||
|
*/
|
||||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
||||||
LockBuffer(rootbuf, BT_READ);
|
LockBuffer(rootbuf, BT_READ);
|
||||||
|
|
||||||
|
@ -221,86 +270,72 @@ _bt_getroot(Relation rel, int access)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
rootblkno = metad->btm_root;
|
rootblkno = metad->btm_fastroot;
|
||||||
|
|
||||||
_bt_relbuf(rel, metabuf); /* done with the meta page */
|
_bt_relbuf(rel, metabuf); /* done with the meta page */
|
||||||
|
|
||||||
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
|
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Race condition: If the root page split between the time we looked
|
* By here, we have a pin and read lock on the root page, and no
|
||||||
* at the metadata page and got the root buffer, then we got the wrong
|
* lock set on the metadata page. Return the root page's buffer.
|
||||||
* buffer. Release it and try again.
|
|
||||||
*/
|
*/
|
||||||
rootpage = BufferGetPage(rootbuf);
|
return rootbuf;
|
||||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
}
|
||||||
|
|
||||||
if (!P_ISROOT(rootopaque))
|
/*
|
||||||
|
* _bt_gettrueroot() -- Get the true root page of the btree.
|
||||||
|
*
|
||||||
|
* This is the same as the BT_READ case of _bt_getroot(), except
|
||||||
|
* we follow the true-root link not the fast-root link.
|
||||||
|
*
|
||||||
|
* By the time we acquire lock on the root page, it might have been split and
|
||||||
|
* not be the true root anymore. This is okay for the present uses of this
|
||||||
|
* routine; we only really need to be able to move up at least one tree level
|
||||||
|
* from whatever non-root page we were at. If we ever do need to lock the
|
||||||
|
* one true root page, we could loop here, re-reading the metapage on each
|
||||||
|
* failure. (Note that it wouldn't do to hold the lock on the metapage while
|
||||||
|
* moving to the root --- that'd deadlock against any concurrent root split.)
|
||||||
|
*/
|
||||||
|
Buffer
|
||||||
|
_bt_gettrueroot(Relation rel)
|
||||||
|
{
|
||||||
|
Buffer metabuf;
|
||||||
|
Page metapg;
|
||||||
|
BTPageOpaque metaopaque;
|
||||||
|
Buffer rootbuf;
|
||||||
|
BlockNumber rootblkno;
|
||||||
|
BTMetaPageData *metad;
|
||||||
|
|
||||||
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||||
|
metapg = BufferGetPage(metabuf);
|
||||||
|
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
||||||
|
metad = BTPageGetMeta(metapg);
|
||||||
|
|
||||||
|
if (!(metaopaque->btpo_flags & BTP_META) ||
|
||||||
|
metad->btm_magic != BTREE_MAGIC)
|
||||||
|
elog(ERROR, "Index %s is not a btree",
|
||||||
|
RelationGetRelationName(rel));
|
||||||
|
|
||||||
|
if (metad->btm_version != BTREE_VERSION)
|
||||||
|
elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
|
||||||
|
RelationGetRelationName(rel),
|
||||||
|
metad->btm_version, BTREE_VERSION);
|
||||||
|
|
||||||
|
/* if no root page initialized yet, fail */
|
||||||
|
if (metad->btm_root == P_NONE)
|
||||||
{
|
{
|
||||||
/*
|
_bt_relbuf(rel, metabuf);
|
||||||
* It happened, but if root page splitter failed to create new
|
return InvalidBuffer;
|
||||||
* root page then we'll go in loop trying to call _bt_getroot
|
|
||||||
* again and again.
|
|
||||||
*/
|
|
||||||
if (FixBTree)
|
|
||||||
{
|
|
||||||
Buffer newrootbuf;
|
|
||||||
|
|
||||||
check_parent:;
|
|
||||||
if (BTreeInvalidParent(rootopaque)) /* unupdated! */
|
|
||||||
{
|
|
||||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
|
||||||
LockBuffer(rootbuf, BT_WRITE);
|
|
||||||
|
|
||||||
/* handle concurrent fix of root page */
|
|
||||||
if (BTreeInvalidParent(rootopaque)) /* unupdated! */
|
|
||||||
{
|
|
||||||
elog(WARNING, "bt_getroot[%s]: fixing root page", RelationGetRelationName(rel));
|
|
||||||
newrootbuf = _bt_fixroot(rel, rootbuf, true);
|
|
||||||
LockBuffer(newrootbuf, BUFFER_LOCK_UNLOCK);
|
|
||||||
LockBuffer(newrootbuf, BT_READ);
|
|
||||||
rootbuf = newrootbuf;
|
|
||||||
rootpage = BufferGetPage(rootbuf);
|
|
||||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
|
||||||
/* New root might be splitted while changing lock */
|
|
||||||
if (P_ISROOT(rootopaque))
|
|
||||||
return (rootbuf);
|
|
||||||
/* rootbuf is read locked */
|
|
||||||
goto check_parent;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* someone else already fixed root */
|
|
||||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
|
||||||
LockBuffer(rootbuf, BT_READ);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Ok, here we have old root page with btpo_parent pointing to
|
|
||||||
* upper level - check parent page because of there is good
|
|
||||||
* chance that parent is root page.
|
|
||||||
*/
|
|
||||||
newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ);
|
|
||||||
_bt_relbuf(rel, rootbuf);
|
|
||||||
rootbuf = newrootbuf;
|
|
||||||
rootpage = BufferGetPage(rootbuf);
|
|
||||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
|
||||||
if (P_ISROOT(rootopaque))
|
|
||||||
return (rootbuf);
|
|
||||||
/* no luck -:( */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* try again */
|
|
||||||
_bt_relbuf(rel, rootbuf);
|
|
||||||
return _bt_getroot(rel, access);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
rootblkno = metad->btm_root;
|
||||||
* By here, we have a correct lock on the root block, its reference
|
|
||||||
* count is correct, and we have no lock set on the metadata page.
|
_bt_relbuf(rel, metabuf); /* done with the meta page */
|
||||||
* Return the root block.
|
|
||||||
*/
|
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
|
||||||
|
|
||||||
return rootbuf;
|
return rootbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -397,13 +432,14 @@ _bt_wrtnorelbuf(Relation rel, Buffer buf)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_pageinit() -- Initialize a new page.
|
* _bt_pageinit() -- Initialize a new page.
|
||||||
|
*
|
||||||
|
* On return, the page header is initialized; data space is empty;
|
||||||
|
* special space is zeroed out.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
_bt_pageinit(Page page, Size size)
|
_bt_pageinit(Page page, Size size)
|
||||||
{
|
{
|
||||||
PageInit(page, size, sizeof(BTPageOpaqueData));
|
PageInit(page, size, sizeof(BTPageOpaqueData));
|
||||||
((BTPageOpaque) PageGetSpecialPointer(page))->btpo_parent =
|
|
||||||
InvalidBlockNumber;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -418,9 +454,12 @@ _bt_pageinit(Page page, Size size)
|
||||||
* at least the old root page when you call this, you're making a big
|
* at least the old root page when you call this, you're making a big
|
||||||
* mistake. On exit, metapage data is correct and we no longer have
|
* mistake. On exit, metapage data is correct and we no longer have
|
||||||
* a pin or lock on the metapage.
|
* a pin or lock on the metapage.
|
||||||
|
*
|
||||||
|
* XXX this is not used for splitting anymore, only in nbtsort.c at the
|
||||||
|
* completion of btree building.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
|
_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
|
||||||
{
|
{
|
||||||
Buffer metabuf;
|
Buffer metabuf;
|
||||||
Page metap;
|
Page metap;
|
||||||
|
@ -431,12 +470,42 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
|
||||||
metap = BufferGetPage(metabuf);
|
metap = BufferGetPage(metabuf);
|
||||||
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
|
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
|
||||||
Assert(metaopaque->btpo_flags & BTP_META);
|
Assert(metaopaque->btpo_flags & BTP_META);
|
||||||
|
|
||||||
|
/* NO ELOG(ERROR) from here till newmeta op is logged */
|
||||||
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
metad = BTPageGetMeta(metap);
|
metad = BTPageGetMeta(metap);
|
||||||
metad->btm_root = rootbknum;
|
metad->btm_root = rootbknum;
|
||||||
if (level == 0) /* called from _do_insert */
|
metad->btm_level = level;
|
||||||
metad->btm_level += 1;
|
metad->btm_fastroot = rootbknum;
|
||||||
else
|
metad->btm_fastlevel = level;
|
||||||
metad->btm_level = level; /* called from btsort */
|
|
||||||
|
/* XLOG stuff */
|
||||||
|
if (!rel->rd_istemp)
|
||||||
|
{
|
||||||
|
xl_btree_newmeta xlrec;
|
||||||
|
XLogRecPtr recptr;
|
||||||
|
XLogRecData rdata[1];
|
||||||
|
|
||||||
|
xlrec.node = rel->rd_node;
|
||||||
|
xlrec.meta.root = metad->btm_root;
|
||||||
|
xlrec.meta.level = metad->btm_level;
|
||||||
|
xlrec.meta.fastroot = metad->btm_fastroot;
|
||||||
|
xlrec.meta.fastlevel = metad->btm_fastlevel;
|
||||||
|
|
||||||
|
rdata[0].buffer = InvalidBuffer;
|
||||||
|
rdata[0].data = (char *) &xlrec;
|
||||||
|
rdata[0].len = SizeOfBtreeNewmeta;
|
||||||
|
rdata[0].next = NULL;
|
||||||
|
|
||||||
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
|
||||||
|
|
||||||
|
PageSetLSN(metap, recptr);
|
||||||
|
PageSetSUI(metap, ThisStartUpID);
|
||||||
|
}
|
||||||
|
|
||||||
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
_bt_wrtbuf(rel, metabuf);
|
_bt_wrtbuf(rel, metabuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -467,6 +536,7 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid)
|
||||||
|
|
||||||
xlrec.target.node = rel->rd_node;
|
xlrec.target.node = rel->rd_node;
|
||||||
xlrec.target.tid = *tid;
|
xlrec.target.tid = *tid;
|
||||||
|
|
||||||
rdata[0].buffer = InvalidBuffer;
|
rdata[0].buffer = InvalidBuffer;
|
||||||
rdata[0].data = (char *) &xlrec;
|
rdata[0].data = (char *) &xlrec;
|
||||||
rdata[0].len = SizeOfBtreeDelete;
|
rdata[0].len = SizeOfBtreeDelete;
|
||||||
|
|
|
@ -12,21 +12,17 @@
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.94 2002/11/15 01:26:08 momjian Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
#include "access/genam.h"
|
#include "access/genam.h"
|
||||||
#include "access/heapam.h"
|
#include "access/heapam.h"
|
||||||
#include "access/nbtree.h"
|
#include "access/nbtree.h"
|
||||||
#include "catalog/index.h"
|
#include "catalog/index.h"
|
||||||
#include "executor/executor.h"
|
|
||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "storage/sinval.h"
|
|
||||||
#include "access/xlogutils.h"
|
|
||||||
|
|
||||||
|
|
||||||
/* Working state for btbuild and its callback */
|
/* Working state for btbuild and its callback */
|
||||||
|
@ -817,396 +813,3 @@ _bt_restscan(IndexScanDesc scan)
|
||||||
ItemPointerSet(current, blkno, offnum);
|
ItemPointerSet(current, blkno, offnum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
_bt_restore_page(Page page, char *from, int len)
|
|
||||||
{
|
|
||||||
BTItemData btdata;
|
|
||||||
Size itemsz;
|
|
||||||
char *end = from + len;
|
|
||||||
|
|
||||||
for (; from < end;)
|
|
||||||
{
|
|
||||||
memcpy(&btdata, from, sizeof(BTItemData));
|
|
||||||
itemsz = IndexTupleDSize(btdata.bti_itup) +
|
|
||||||
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
|
||||||
itemsz = MAXALIGN(itemsz);
|
|
||||||
if (PageAddItem(page, (Item) from, itemsz,
|
|
||||||
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
|
|
||||||
elog(PANIC, "_bt_restore_page: can't add item to page");
|
|
||||||
from += itemsz;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
xl_btree_delete *xlrec;
|
|
||||||
Relation reln;
|
|
||||||
Buffer buffer;
|
|
||||||
Page page;
|
|
||||||
|
|
||||||
if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
|
|
||||||
return;
|
|
||||||
|
|
||||||
xlrec = (xl_btree_delete *) XLogRecGetData(record);
|
|
||||||
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
|
|
||||||
if (!RelationIsValid(reln))
|
|
||||||
return;
|
|
||||||
buffer = XLogReadBuffer(false, reln,
|
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_delete_redo: block unfound");
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
if (PageIsNew((PageHeader) page))
|
|
||||||
elog(PANIC, "btree_delete_redo: uninitialized page");
|
|
||||||
|
|
||||||
if (XLByteLE(lsn, PageGetLSN(page)))
|
|
||||||
{
|
|
||||||
UnlockAndReleaseBuffer(buffer);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
btree_xlog_insert(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
xl_btree_insert *xlrec;
|
|
||||||
Relation reln;
|
|
||||||
Buffer buffer;
|
|
||||||
Page page;
|
|
||||||
BTPageOpaque pageop;
|
|
||||||
|
|
||||||
if (redo && (record->xl_info & XLR_BKP_BLOCK_1))
|
|
||||||
return;
|
|
||||||
|
|
||||||
xlrec = (xl_btree_insert *) XLogRecGetData(record);
|
|
||||||
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
|
|
||||||
if (!RelationIsValid(reln))
|
|
||||||
return;
|
|
||||||
buffer = XLogReadBuffer(false, reln,
|
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
if (PageIsNew((PageHeader) page))
|
|
||||||
elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
if (redo)
|
|
||||||
{
|
|
||||||
if (XLByteLE(lsn, PageGetLSN(page)))
|
|
||||||
{
|
|
||||||
UnlockAndReleaseBuffer(buffer);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (PageAddItem(page, (Item) ((char *) xlrec + SizeOfBtreeInsert),
|
|
||||||
record->xl_len - SizeOfBtreeInsert,
|
|
||||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
|
||||||
LP_USED) == InvalidOffsetNumber)
|
|
||||||
elog(PANIC, "btree_insert_redo: failed to add item");
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (XLByteLT(PageGetLSN(page), lsn))
|
|
||||||
elog(PANIC, "btree_insert_undo: bad page LSN");
|
|
||||||
|
|
||||||
if (!P_ISLEAF(pageop))
|
|
||||||
{
|
|
||||||
UnlockAndReleaseBuffer(buffer);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
elog(PANIC, "btree_insert_undo: unimplemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
btree_xlog_split(bool redo, bool onleft, XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
|
|
||||||
Relation reln;
|
|
||||||
BlockNumber blkno;
|
|
||||||
Buffer buffer;
|
|
||||||
Page page;
|
|
||||||
BTPageOpaque pageop;
|
|
||||||
char *op = (redo) ? "redo" : "undo";
|
|
||||||
bool isleaf = (record->xl_info & XLOG_BTREE_LEAF);
|
|
||||||
|
|
||||||
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
|
|
||||||
if (!RelationIsValid(reln))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* Left (original) sibling */
|
|
||||||
blkno = (onleft) ? ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
|
|
||||||
BlockIdGetBlockNumber(&(xlrec->otherblk));
|
|
||||||
buffer = XLogReadBuffer(false, reln, blkno);
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_split_%s: lost left sibling", op);
|
|
||||||
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
if (redo)
|
|
||||||
_bt_pageinit(page, BufferGetPageSize(buffer));
|
|
||||||
else if (PageIsNew((PageHeader) page))
|
|
||||||
elog(PANIC, "btree_split_undo: uninitialized left sibling");
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
if (redo)
|
|
||||||
{
|
|
||||||
pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
|
|
||||||
pageop->btpo_prev = BlockIdGetBlockNumber(&(xlrec->leftblk));
|
|
||||||
if (onleft)
|
|
||||||
pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->otherblk));
|
|
||||||
else
|
|
||||||
pageop->btpo_next = ItemPointerGetBlockNumber(&(xlrec->target.tid));
|
|
||||||
pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
|
|
||||||
|
|
||||||
_bt_restore_page(page, (char *) xlrec + SizeOfBtreeSplit, xlrec->leftlen);
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
/* undo */
|
|
||||||
{
|
|
||||||
if (XLByteLT(PageGetLSN(page), lsn))
|
|
||||||
elog(PANIC, "btree_split_undo: bad left sibling LSN");
|
|
||||||
elog(PANIC, "btree_split_undo: unimplemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Right (new) sibling */
|
|
||||||
blkno = (onleft) ? BlockIdGetBlockNumber(&(xlrec->otherblk)) :
|
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid));
|
|
||||||
buffer = XLogReadBuffer((redo) ? true : false, reln, blkno);
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_split_%s: lost right sibling", op);
|
|
||||||
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
if (redo)
|
|
||||||
_bt_pageinit(page, BufferGetPageSize(buffer));
|
|
||||||
else if (PageIsNew((PageHeader) page))
|
|
||||||
elog(PANIC, "btree_split_undo: uninitialized right sibling");
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
if (redo)
|
|
||||||
{
|
|
||||||
pageop->btpo_parent = BlockIdGetBlockNumber(&(xlrec->parentblk));
|
|
||||||
pageop->btpo_prev = (onleft) ?
|
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)) :
|
|
||||||
BlockIdGetBlockNumber(&(xlrec->otherblk));
|
|
||||||
pageop->btpo_next = BlockIdGetBlockNumber(&(xlrec->rightblk));
|
|
||||||
pageop->btpo_flags = (isleaf) ? BTP_LEAF : 0;
|
|
||||||
|
|
||||||
_bt_restore_page(page,
|
|
||||||
(char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
|
|
||||||
record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
/* undo */
|
|
||||||
{
|
|
||||||
if (XLByteLT(PageGetLSN(page), lsn))
|
|
||||||
elog(PANIC, "btree_split_undo: bad right sibling LSN");
|
|
||||||
elog(PANIC, "btree_split_undo: unimplemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* Right (next) page */
|
|
||||||
blkno = BlockIdGetBlockNumber(&(xlrec->rightblk));
|
|
||||||
if (blkno == P_NONE)
|
|
||||||
return;
|
|
||||||
|
|
||||||
buffer = XLogReadBuffer(false, reln, blkno);
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_split_redo: lost next right page");
|
|
||||||
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
if (PageIsNew((PageHeader) page))
|
|
||||||
elog(PANIC, "btree_split_redo: uninitialized next right page");
|
|
||||||
|
|
||||||
if (XLByteLE(lsn, PageGetLSN(page)))
|
|
||||||
{
|
|
||||||
UnlockAndReleaseBuffer(buffer);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
pageop->btpo_prev = (onleft) ?
|
|
||||||
BlockIdGetBlockNumber(&(xlrec->otherblk)) :
|
|
||||||
ItemPointerGetBlockNumber(&(xlrec->target.tid));
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
|
|
||||||
Relation reln;
|
|
||||||
Buffer buffer;
|
|
||||||
Page page;
|
|
||||||
BTPageOpaque pageop;
|
|
||||||
Buffer metabuf;
|
|
||||||
Page metapg;
|
|
||||||
BTMetaPageData md;
|
|
||||||
|
|
||||||
if (!redo)
|
|
||||||
return;
|
|
||||||
|
|
||||||
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
|
|
||||||
if (!RelationIsValid(reln))
|
|
||||||
return;
|
|
||||||
buffer = XLogReadBuffer(true, reln, BlockIdGetBlockNumber(&(xlrec->rootblk)));
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_newroot_redo: no root page");
|
|
||||||
metabuf = XLogReadBuffer(false, reln, BTREE_METAPAGE);
|
|
||||||
if (!BufferIsValid(buffer))
|
|
||||||
elog(PANIC, "btree_newroot_redo: no metapage");
|
|
||||||
page = (Page) BufferGetPage(buffer);
|
|
||||||
_bt_pageinit(page, BufferGetPageSize(buffer));
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
pageop->btpo_flags |= BTP_ROOT;
|
|
||||||
pageop->btpo_prev = pageop->btpo_next = P_NONE;
|
|
||||||
pageop->btpo_parent = BTREE_METAPAGE;
|
|
||||||
|
|
||||||
if (record->xl_info & XLOG_BTREE_LEAF)
|
|
||||||
pageop->btpo_flags |= BTP_LEAF;
|
|
||||||
|
|
||||||
if (record->xl_len > SizeOfBtreeNewroot)
|
|
||||||
_bt_restore_page(page,
|
|
||||||
(char *) xlrec + SizeOfBtreeNewroot,
|
|
||||||
record->xl_len - SizeOfBtreeNewroot);
|
|
||||||
|
|
||||||
PageSetLSN(page, lsn);
|
|
||||||
PageSetSUI(page, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(buffer);
|
|
||||||
|
|
||||||
metapg = BufferGetPage(metabuf);
|
|
||||||
_bt_pageinit(metapg, BufferGetPageSize(metabuf));
|
|
||||||
md.btm_magic = BTREE_MAGIC;
|
|
||||||
md.btm_version = BTREE_VERSION;
|
|
||||||
md.btm_root = BlockIdGetBlockNumber(&(xlrec->rootblk));
|
|
||||||
md.btm_level = xlrec->level;
|
|
||||||
memcpy((char *) BTPageGetMeta(metapg), (char *) &md, sizeof(md));
|
|
||||||
|
|
||||||
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
|
||||||
pageop->btpo_flags = BTP_META;
|
|
||||||
|
|
||||||
PageSetLSN(metapg, lsn);
|
|
||||||
PageSetSUI(metapg, ThisStartUpID);
|
|
||||||
UnlockAndWriteBuffer(metabuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
btree_redo(XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
|
||||||
|
|
||||||
info &= ~XLOG_BTREE_LEAF;
|
|
||||||
if (info == XLOG_BTREE_DELETE)
|
|
||||||
btree_xlog_delete(true, lsn, record);
|
|
||||||
else if (info == XLOG_BTREE_INSERT)
|
|
||||||
btree_xlog_insert(true, lsn, record);
|
|
||||||
else if (info == XLOG_BTREE_SPLIT)
|
|
||||||
btree_xlog_split(true, false, lsn, record); /* new item on the right */
|
|
||||||
else if (info == XLOG_BTREE_SPLEFT)
|
|
||||||
btree_xlog_split(true, true, lsn, record); /* new item on the left */
|
|
||||||
else if (info == XLOG_BTREE_NEWROOT)
|
|
||||||
btree_xlog_newroot(true, lsn, record);
|
|
||||||
else
|
|
||||||
elog(PANIC, "btree_redo: unknown op code %u", info);
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
btree_undo(XLogRecPtr lsn, XLogRecord *record)
|
|
||||||
{
|
|
||||||
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
|
||||||
|
|
||||||
info &= ~XLOG_BTREE_LEAF;
|
|
||||||
if (info == XLOG_BTREE_DELETE)
|
|
||||||
btree_xlog_delete(false, lsn, record);
|
|
||||||
else if (info == XLOG_BTREE_INSERT)
|
|
||||||
btree_xlog_insert(false, lsn, record);
|
|
||||||
else if (info == XLOG_BTREE_SPLIT)
|
|
||||||
btree_xlog_split(false, false, lsn, record); /* new item on the right */
|
|
||||||
else if (info == XLOG_BTREE_SPLEFT)
|
|
||||||
btree_xlog_split(false, true, lsn, record); /* new item on the left */
|
|
||||||
else if (info == XLOG_BTREE_NEWROOT)
|
|
||||||
btree_xlog_newroot(false, lsn, record);
|
|
||||||
else
|
|
||||||
elog(PANIC, "btree_undo: unknown op code %u", info);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
out_target(char *buf, xl_btreetid *target)
|
|
||||||
{
|
|
||||||
sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
|
|
||||||
target->node.tblNode, target->node.relNode,
|
|
||||||
ItemPointerGetBlockNumber(&(target->tid)),
|
|
||||||
ItemPointerGetOffsetNumber(&(target->tid)));
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
btree_desc(char *buf, uint8 xl_info, char *rec)
|
|
||||||
{
|
|
||||||
uint8 info = xl_info & ~XLR_INFO_MASK;
|
|
||||||
|
|
||||||
info &= ~XLOG_BTREE_LEAF;
|
|
||||||
if (info == XLOG_BTREE_INSERT)
|
|
||||||
{
|
|
||||||
xl_btree_insert *xlrec = (xl_btree_insert *) rec;
|
|
||||||
|
|
||||||
strcat(buf, "insert: ");
|
|
||||||
out_target(buf, &(xlrec->target));
|
|
||||||
}
|
|
||||||
else if (info == XLOG_BTREE_DELETE)
|
|
||||||
{
|
|
||||||
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
|
|
||||||
|
|
||||||
strcat(buf, "delete: ");
|
|
||||||
out_target(buf, &(xlrec->target));
|
|
||||||
}
|
|
||||||
else if (info == XLOG_BTREE_SPLIT || info == XLOG_BTREE_SPLEFT)
|
|
||||||
{
|
|
||||||
xl_btree_split *xlrec = (xl_btree_split *) rec;
|
|
||||||
|
|
||||||
sprintf(buf + strlen(buf), "split(%s): ",
|
|
||||||
(info == XLOG_BTREE_SPLIT) ? "right" : "left");
|
|
||||||
out_target(buf, &(xlrec->target));
|
|
||||||
sprintf(buf + strlen(buf), "; oth %u; rgh %u",
|
|
||||||
BlockIdGetBlockNumber(&xlrec->otherblk),
|
|
||||||
BlockIdGetBlockNumber(&xlrec->rightblk));
|
|
||||||
}
|
|
||||||
else if (info == XLOG_BTREE_NEWROOT)
|
|
||||||
{
|
|
||||||
xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;
|
|
||||||
|
|
||||||
sprintf(buf + strlen(buf), "root: node %u/%u; blk %u",
|
|
||||||
xlrec->node.tblNode, xlrec->node.relNode,
|
|
||||||
BlockIdGetBlockNumber(&xlrec->rootblk));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
strcat(buf, "UNKNOWN");
|
|
||||||
}
|
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -895,6 +895,89 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* _bt_get_endpoint() -- Find the first or last page on a given tree level
|
||||||
|
*
|
||||||
|
* If the index is empty, we will return InvalidBuffer; any other failure
|
||||||
|
* condition causes elog().
|
||||||
|
*
|
||||||
|
* The returned buffer is pinned and read-locked.
|
||||||
|
*/
|
||||||
|
Buffer
|
||||||
|
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
|
||||||
|
{
|
||||||
|
Buffer buf;
|
||||||
|
Page page;
|
||||||
|
BTPageOpaque opaque;
|
||||||
|
OffsetNumber offnum;
|
||||||
|
BlockNumber blkno;
|
||||||
|
BTItem btitem;
|
||||||
|
IndexTuple itup;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are looking for a leaf page, okay to descend from fast root;
|
||||||
|
* otherwise better descend from true root. (There is no point in being
|
||||||
|
* smarter about intermediate levels.)
|
||||||
|
*/
|
||||||
|
if (level == 0)
|
||||||
|
buf = _bt_getroot(rel, BT_READ);
|
||||||
|
else
|
||||||
|
buf = _bt_gettrueroot(rel);
|
||||||
|
|
||||||
|
if (!BufferIsValid(buf))
|
||||||
|
{
|
||||||
|
/* empty index... */
|
||||||
|
return InvalidBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
page = BufferGetPage(buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If we landed on a deleted page, step right to find a live page
|
||||||
|
* (there must be one). Also, if we want the rightmost page,
|
||||||
|
* step right if needed to get to it (this could happen if the
|
||||||
|
* page split since we obtained a pointer to it).
|
||||||
|
*/
|
||||||
|
while (P_ISDELETED(opaque) ||
|
||||||
|
(rightmost && !P_RIGHTMOST(opaque)))
|
||||||
|
{
|
||||||
|
blkno = opaque->btpo_next;
|
||||||
|
if (blkno == P_NONE)
|
||||||
|
elog(ERROR, "_bt_get_endpoint: ran off end of btree");
|
||||||
|
_bt_relbuf(rel, buf);
|
||||||
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
||||||
|
page = BufferGetPage(buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Done? */
|
||||||
|
if (opaque->btpo.level == level)
|
||||||
|
break;
|
||||||
|
if (opaque->btpo.level < level)
|
||||||
|
elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);
|
||||||
|
|
||||||
|
/* Step to leftmost or rightmost child page */
|
||||||
|
if (rightmost)
|
||||||
|
offnum = PageGetMaxOffsetNumber(page);
|
||||||
|
else
|
||||||
|
offnum = P_FIRSTDATAKEY(opaque);
|
||||||
|
|
||||||
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
||||||
|
itup = &(btitem->bti_itup);
|
||||||
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
||||||
|
|
||||||
|
_bt_relbuf(rel, buf);
|
||||||
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
||||||
|
page = BufferGetPage(buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_endpoint() -- Find the first or last key in the index.
|
* _bt_endpoint() -- Find the first or last key in the index.
|
||||||
*
|
*
|
||||||
|
@ -910,8 +993,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
Page page;
|
Page page;
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
ItemPointer current;
|
ItemPointer current;
|
||||||
OffsetNumber offnum,
|
OffsetNumber maxoff;
|
||||||
maxoff;
|
|
||||||
OffsetNumber start;
|
OffsetNumber start;
|
||||||
BlockNumber blkno;
|
BlockNumber blkno;
|
||||||
BTItem btitem;
|
BTItem btitem;
|
||||||
|
@ -929,7 +1011,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
* simplified version of _bt_search(). We don't maintain a stack
|
* simplified version of _bt_search(). We don't maintain a stack
|
||||||
* since we know we won't need it.
|
* since we know we won't need it.
|
||||||
*/
|
*/
|
||||||
buf = _bt_getroot(rel, BT_READ);
|
buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
|
||||||
|
|
||||||
if (!BufferIsValid(buf))
|
if (!BufferIsValid(buf))
|
||||||
{
|
{
|
||||||
|
@ -942,51 +1024,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
blkno = BufferGetBlockNumber(buf);
|
blkno = BufferGetBlockNumber(buf);
|
||||||
page = BufferGetPage(buf);
|
page = BufferGetPage(buf);
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
Assert(P_ISLEAF(opaque));
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
if (P_ISLEAF(opaque))
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (ScanDirectionIsForward(dir))
|
|
||||||
offnum = P_FIRSTDATAKEY(opaque);
|
|
||||||
else
|
|
||||||
offnum = PageGetMaxOffsetNumber(page);
|
|
||||||
|
|
||||||
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
||||||
itup = &(btitem->bti_itup);
|
|
||||||
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
||||||
|
|
||||||
_bt_relbuf(rel, buf);
|
|
||||||
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
||||||
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Race condition: If the child page we just stepped onto was just
|
|
||||||
* split, we need to make sure we're all the way at the right edge
|
|
||||||
* of the tree. See the paper by Lehman and Yao.
|
|
||||||
*/
|
|
||||||
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
|
|
||||||
{
|
|
||||||
do
|
|
||||||
{
|
|
||||||
blkno = opaque->btpo_next;
|
|
||||||
_bt_relbuf(rel, buf);
|
|
||||||
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
} while (!P_RIGHTMOST(opaque));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* okay, we've got the {left,right}-most page in the tree */
|
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
maxoff = PageGetMaxOffsetNumber(page);
|
||||||
|
|
||||||
if (ScanDirectionIsForward(dir))
|
if (ScanDirectionIsForward(dir))
|
||||||
{
|
{
|
||||||
Assert(P_LEFTMOST(opaque));
|
/* There could be dead pages to the left, so not this: */
|
||||||
|
/* Assert(P_LEFTMOST(opaque)); */
|
||||||
|
|
||||||
start = P_FIRSTDATAKEY(opaque);
|
start = P_FIRSTDATAKEY(opaque);
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,7 +35,7 @@
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.70 2002/11/15 01:26:08 momjian Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -43,6 +43,7 @@
|
||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
#include "access/nbtree.h"
|
#include "access/nbtree.h"
|
||||||
|
#include "miscadmin.h"
|
||||||
#include "utils/tuplesort.h"
|
#include "utils/tuplesort.h"
|
||||||
|
|
||||||
|
|
||||||
|
@ -76,7 +77,7 @@ typedef struct BTPageState
|
||||||
BTItem btps_minkey; /* copy of minimum key (first item) on
|
BTItem btps_minkey; /* copy of minimum key (first item) on
|
||||||
* page */
|
* page */
|
||||||
OffsetNumber btps_lastoff; /* last item offset loaded */
|
OffsetNumber btps_lastoff; /* last item offset loaded */
|
||||||
int btps_level; /* tree level (0 = leaf) */
|
uint32 btps_level; /* tree level (0 = leaf) */
|
||||||
Size btps_full; /* "full" if less than this much free
|
Size btps_full; /* "full" if less than this much free
|
||||||
* space */
|
* space */
|
||||||
struct BTPageState *btps_next; /* link to parent level, if any */
|
struct BTPageState *btps_next; /* link to parent level, if any */
|
||||||
|
@ -90,8 +91,9 @@ typedef struct BTPageState
|
||||||
0)
|
0)
|
||||||
|
|
||||||
|
|
||||||
static void _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags);
|
static void _bt_blnewpage(Relation index, Buffer *buf, Page *page,
|
||||||
static BTPageState *_bt_pagestate(Relation index, int flags, int level);
|
uint32 level);
|
||||||
|
static BTPageState *_bt_pagestate(Relation index, uint32 level);
|
||||||
static void _bt_slideleft(Relation index, Buffer buf, Page page);
|
static void _bt_slideleft(Relation index, Buffer buf, Page page);
|
||||||
static void _bt_sortaddtup(Page page, Size itemsize,
|
static void _bt_sortaddtup(Page page, Size itemsize,
|
||||||
BTItem btitem, OffsetNumber itup_off);
|
BTItem btitem, OffsetNumber itup_off);
|
||||||
|
@ -179,7 +181,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
|
||||||
* allocate a new, clean btree page, not linked to any siblings.
|
* allocate a new, clean btree page, not linked to any siblings.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
|
_bt_blnewpage(Relation index, Buffer *buf, Page *page, uint32 level)
|
||||||
{
|
{
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
|
|
||||||
|
@ -192,23 +194,67 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
|
||||||
/* Initialize BT opaque state */
|
/* Initialize BT opaque state */
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
|
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
|
||||||
opaque->btpo_prev = opaque->btpo_next = P_NONE;
|
opaque->btpo_prev = opaque->btpo_next = P_NONE;
|
||||||
opaque->btpo_flags = flags;
|
opaque->btpo.level = level;
|
||||||
|
opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
|
||||||
|
|
||||||
/* Make the P_HIKEY line pointer appear allocated */
|
/* Make the P_HIKEY line pointer appear allocated */
|
||||||
((PageHeader) *page)->pd_lower += sizeof(ItemIdData);
|
((PageHeader) *page)->pd_lower += sizeof(ItemIdData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* emit a completed btree page, and release the lock and pin on it.
|
||||||
|
* This is essentially _bt_wrtbuf except we also emit a WAL record.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
_bt_blwritepage(Relation index, Buffer buf)
|
||||||
|
{
|
||||||
|
Page pg = BufferGetPage(buf);
|
||||||
|
|
||||||
|
/* NO ELOG(ERROR) from here till newpage op is logged */
|
||||||
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
|
/* XLOG stuff */
|
||||||
|
if (!index->rd_istemp)
|
||||||
|
{
|
||||||
|
xl_btree_newpage xlrec;
|
||||||
|
XLogRecPtr recptr;
|
||||||
|
XLogRecData rdata[2];
|
||||||
|
|
||||||
|
xlrec.node = index->rd_node;
|
||||||
|
xlrec.blkno = BufferGetBlockNumber(buf);
|
||||||
|
|
||||||
|
rdata[0].buffer = InvalidBuffer;
|
||||||
|
rdata[0].data = (char *) &xlrec;
|
||||||
|
rdata[0].len = SizeOfBtreeNewpage;
|
||||||
|
rdata[0].next = &(rdata[1]);
|
||||||
|
|
||||||
|
rdata[1].buffer = buf;
|
||||||
|
rdata[1].data = (char *) pg;
|
||||||
|
rdata[1].len = BLCKSZ;
|
||||||
|
rdata[1].next = NULL;
|
||||||
|
|
||||||
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWPAGE, rdata);
|
||||||
|
|
||||||
|
PageSetLSN(pg, recptr);
|
||||||
|
PageSetSUI(pg, ThisStartUpID);
|
||||||
|
}
|
||||||
|
|
||||||
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
|
_bt_wrtbuf(index, buf);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* allocate and initialize a new BTPageState. the returned structure
|
* allocate and initialize a new BTPageState. the returned structure
|
||||||
* is suitable for immediate use by _bt_buildadd.
|
* is suitable for immediate use by _bt_buildadd.
|
||||||
*/
|
*/
|
||||||
static BTPageState *
|
static BTPageState *
|
||||||
_bt_pagestate(Relation index, int flags, int level)
|
_bt_pagestate(Relation index, uint32 level)
|
||||||
{
|
{
|
||||||
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
|
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
|
||||||
|
|
||||||
/* create initial page */
|
/* create initial page */
|
||||||
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
|
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), level);
|
||||||
|
|
||||||
state->btps_minkey = (BTItem) NULL;
|
state->btps_minkey = (BTItem) NULL;
|
||||||
/* initialize lastoff so first item goes into P_FIRSTKEY */
|
/* initialize lastoff so first item goes into P_FIRSTKEY */
|
||||||
|
@ -365,9 +411,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
|
||||||
ItemId hii;
|
ItemId hii;
|
||||||
BTItem obti;
|
BTItem obti;
|
||||||
|
|
||||||
/* Create new page */
|
/* Create new page on same level */
|
||||||
_bt_blnewpage(index, &nbuf, &npage,
|
_bt_blnewpage(index, &nbuf, &npage, state->btps_level);
|
||||||
(state->btps_level > 0) ? 0 : BTP_LEAF);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We copy the last item on the page into the new page, and then
|
* We copy the last item on the page into the new page, and then
|
||||||
|
@ -396,10 +441,8 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
|
||||||
* btree level.
|
* btree level.
|
||||||
*/
|
*/
|
||||||
if (state->btps_next == (BTPageState *) NULL)
|
if (state->btps_next == (BTPageState *) NULL)
|
||||||
{
|
state->btps_next = _bt_pagestate(index, state->btps_level + 1);
|
||||||
state->btps_next =
|
|
||||||
_bt_pagestate(index, 0, state->btps_level + 1);
|
|
||||||
}
|
|
||||||
Assert(state->btps_minkey != NULL);
|
Assert(state->btps_minkey != NULL);
|
||||||
ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
|
ItemPointerSet(&(state->btps_minkey->bti_itup.t_tid),
|
||||||
BufferGetBlockNumber(obuf), P_HIKEY);
|
BufferGetBlockNumber(obuf), P_HIKEY);
|
||||||
|
@ -414,16 +457,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
|
||||||
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
|
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Set the sibling links for both pages, and parent links too.
|
* Set the sibling links for both pages.
|
||||||
*
|
|
||||||
* It's not necessary to set the parent link at all, because it's
|
|
||||||
* only used for handling concurrent root splits, but we may as
|
|
||||||
* well do it as a debugging aid. Note we set new page's link as
|
|
||||||
* well as old's, because if the new page turns out to be the last
|
|
||||||
* of the level, _bt_uppershutdown won't change it. The links may
|
|
||||||
* be out of date by the time the build finishes, but that's OK;
|
|
||||||
* they need only point to a left-sibling of the true parent. See
|
|
||||||
* the README file for more info.
|
|
||||||
*/
|
*/
|
||||||
{
|
{
|
||||||
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
|
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
|
||||||
|
@ -431,9 +465,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
|
||||||
|
|
||||||
oopaque->btpo_next = BufferGetBlockNumber(nbuf);
|
oopaque->btpo_next = BufferGetBlockNumber(nbuf);
|
||||||
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
|
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
|
||||||
nopaque->btpo_next = P_NONE;
|
nopaque->btpo_next = P_NONE; /* redundant */
|
||||||
oopaque->btpo_parent = nopaque->btpo_parent =
|
|
||||||
BufferGetBlockNumber(state->btps_next->btps_buf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -441,7 +473,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)
|
||||||
* can give up our lock (if we had one; most likely BuildingBtree
|
* can give up our lock (if we had one; most likely BuildingBtree
|
||||||
* is set, so we aren't locking).
|
* is set, so we aren't locking).
|
||||||
*/
|
*/
|
||||||
_bt_wrtbuf(index, obuf);
|
_bt_blwritepage(index, obuf);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reset last_off to point to new page
|
* Reset last_off to point to new page
|
||||||
|
@ -519,7 +551,7 @@ _bt_uppershutdown(Relation index, BTPageState *state)
|
||||||
* slid back one slot. Then we can dump out the page.
|
* slid back one slot. Then we can dump out the page.
|
||||||
*/
|
*/
|
||||||
_bt_slideleft(index, s->btps_buf, s->btps_page);
|
_bt_slideleft(index, s->btps_buf, s->btps_page);
|
||||||
_bt_wrtbuf(index, s->btps_buf);
|
_bt_blwritepage(index, s->btps_buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -603,7 +635,7 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
|
||||||
|
|
||||||
/* When we see first tuple, create first index page */
|
/* When we see first tuple, create first index page */
|
||||||
if (state == NULL)
|
if (state == NULL)
|
||||||
state = _bt_pagestate(index, BTP_LEAF, 0);
|
state = _bt_pagestate(index, 0);
|
||||||
|
|
||||||
if (load1)
|
if (load1)
|
||||||
{
|
{
|
||||||
|
@ -623,13 +655,13 @@ _bt_load(Relation index, BTSpool *btspool, BTSpool *btspool2)
|
||||||
_bt_freeskey(indexScanKey);
|
_bt_freeskey(indexScanKey);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
/* merge is unnecessary */
|
|
||||||
{
|
{
|
||||||
|
/* merge is unnecessary */
|
||||||
while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL)
|
while (bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free), bti != (BTItem) NULL)
|
||||||
{
|
{
|
||||||
/* When we see first tuple, create first index page */
|
/* When we see first tuple, create first index page */
|
||||||
if (state == NULL)
|
if (state == NULL)
|
||||||
state = _bt_pagestate(index, BTP_LEAF, 0);
|
state = _bt_pagestate(index, 0);
|
||||||
|
|
||||||
_bt_buildadd(index, state, bti);
|
_bt_buildadd(index, state, bti);
|
||||||
if (should_free)
|
if (should_free)
|
||||||
|
|
|
@ -0,0 +1,780 @@
|
||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* nbtxlog.c
|
||||||
|
* WAL replay logic for btrees.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
|
*
|
||||||
|
* IDENTIFICATION
|
||||||
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.1 2003/02/21 00:06:21 tgl Exp $
|
||||||
|
*
|
||||||
|
*-------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include "access/nbtree.h"
|
||||||
|
#include "access/xlogutils.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We must keep track of expected insertions due to page splits, and apply
|
||||||
|
* them manually if they are not seen in the WAL log during replay. This
|
||||||
|
* makes it safe for page insertion to be a multiple-WAL-action process.
|
||||||
|
*
|
||||||
|
* The data structure is a simple linked list --- this should be good enough,
|
||||||
|
* since we don't expect a page split to remain incomplete for long.
|
||||||
|
*/
|
||||||
|
typedef struct bt_incomplete_split
|
||||||
|
{
|
||||||
|
RelFileNode node; /* the index */
|
||||||
|
BlockNumber leftblk; /* left half of split */
|
||||||
|
BlockNumber rightblk; /* right half of split */
|
||||||
|
bool is_root; /* we split the root */
|
||||||
|
} bt_incomplete_split;
|
||||||
|
|
||||||
|
static List *incomplete_splits;
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
log_incomplete_split(RelFileNode node, BlockNumber leftblk,
|
||||||
|
BlockNumber rightblk, bool is_root)
|
||||||
|
{
|
||||||
|
bt_incomplete_split *split = palloc(sizeof(bt_incomplete_split));
|
||||||
|
|
||||||
|
split->node = node;
|
||||||
|
split->leftblk = leftblk;
|
||||||
|
split->rightblk = rightblk;
|
||||||
|
split->is_root = is_root;
|
||||||
|
incomplete_splits = lappend(incomplete_splits, split);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
forget_matching_split(Relation reln, RelFileNode node,
|
||||||
|
BlockNumber insertblk, OffsetNumber offnum,
|
||||||
|
bool is_root)
|
||||||
|
{
|
||||||
|
Buffer buffer;
|
||||||
|
Page page;
|
||||||
|
BTItem btitem;
|
||||||
|
BlockNumber rightblk;
|
||||||
|
List *l;
|
||||||
|
|
||||||
|
/* Get downlink TID from page */
|
||||||
|
buffer = XLogReadBuffer(false, reln, insertblk);
|
||||||
|
if (!BufferIsValid(buffer))
|
||||||
|
elog(PANIC, "forget_matching_split: block unfound");
|
||||||
|
page = (Page) BufferGetPage(buffer);
|
||||||
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
||||||
|
rightblk = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
|
||||||
|
Assert(ItemPointerGetOffsetNumber(&(btitem->bti_itup.t_tid)) == P_HIKEY);
|
||||||
|
UnlockAndReleaseBuffer(buffer);
|
||||||
|
|
||||||
|
foreach(l, incomplete_splits)
|
||||||
|
{
|
||||||
|
bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
|
||||||
|
|
||||||
|
if (RelFileNodeEquals(node, split->node) &&
|
||||||
|
rightblk == split->rightblk)
|
||||||
|
{
|
||||||
|
if (is_root != split->is_root)
|
||||||
|
elog(LOG, "forget_matching_split: fishy is_root data");
|
||||||
|
incomplete_splits = lremove(split, incomplete_splits);
|
||||||
|
break; /* need not look further */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
_bt_restore_page(Page page, char *from, int len)
|
||||||
|
{
|
||||||
|
BTItemData btdata;
|
||||||
|
Size itemsz;
|
||||||
|
char *end = from + len;
|
||||||
|
|
||||||
|
for (; from < end;)
|
||||||
|
{
|
||||||
|
memcpy(&btdata, from, sizeof(BTItemData));
|
||||||
|
itemsz = IndexTupleDSize(btdata.bti_itup) +
|
||||||
|
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
||||||
|
itemsz = MAXALIGN(itemsz);
|
||||||
|
if (PageAddItem(page, (Item) from, itemsz,
|
||||||
|
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
|
||||||
|
elog(PANIC, "_bt_restore_page: can't add item to page");
|
||||||
|
from += itemsz;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Rebuild the btree metapage from scratch with the given root/fastroot
 * pointers and levels.  The metapage is always reconstructed whole during
 * replay, so no backup block is needed for it.
 */
static void
_bt_restore_meta(Relation reln, XLogRecPtr lsn,
				 BlockNumber root, uint32 level,
				 BlockNumber fastroot, uint32 fastlevel)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *md;
	BTPageOpaque pageop;

	/* extend = true: the metapage may not exist yet in a fresh index */
	metabuf = XLogReadBuffer(true, reln, BTREE_METAPAGE);
	if (!BufferIsValid(metabuf))
		elog(PANIC, "_bt_restore_meta: no metapage");

	metapg = BufferGetPage(metabuf);
	_bt_pageinit(metapg, BufferGetPageSize(metabuf));

	md = BTPageGetMeta(metapg);
	md->btm_magic = BTREE_MAGIC;
	md->btm_version = BTREE_VERSION;
	md->btm_root = root;
	md->btm_level = level;
	md->btm_fastroot = fastroot;
	md->btm_fastlevel = fastlevel;

	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
	pageop->btpo_flags = BTP_META;

	PageSetLSN(metapg, lsn);
	PageSetSUI(metapg, ThisStartUpID);
	UnlockAndWriteBuffer(metabuf);
}
|
||||||
|
|
||||||
|
/*
 * Replay (or undo) a btree tuple insertion.
 *
 * isleaf: record was XLOG_BTREE_INSERT_LEAF (insert into a leaf page).
 * ismeta: record also carries an xl_btree_metadata update (the metapage
 * must be restored even when the target page itself is covered by a
 * full-page backup block).
 *
 * The inserted tuple's image follows the fixed-size record header (and
 * the metadata struct, if ismeta).
 */
static void
btree_xlog_insert(bool redo, bool isleaf, bool ismeta,
				  XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	char	   *datapos;
	int			datalen;
	xl_btree_metadata md;

	/* locate the tuple image within the record's payload */
	datapos = (char *) xlrec + SizeOfBtreeInsert;
	datalen = record->xl_len - SizeOfBtreeInsert;
	if (ismeta)
	{
		memcpy(&md, datapos, sizeof(xl_btree_metadata));
		datapos += sizeof(xl_btree_metadata);
		datalen -= sizeof(xl_btree_metadata);
	}

	/*
	 * If the target page was written as a backup block and there is no
	 * metadata to restore nor incomplete split to resolve, there is
	 * nothing left for us to do.
	 */
	if (redo && (record->xl_info & XLR_BKP_BLOCK_1) && !ismeta &&
		incomplete_splits == NIL)
		return;					/* nothing to do */

	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;

	/* Re-apply the insertion unless a backup block already restored it */
	if (!redo || !(record->xl_info & XLR_BKP_BLOCK_1))
	{
		buffer = XLogReadBuffer(false, reln,
						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
		if (!BufferIsValid(buffer))
			elog(PANIC, "btree_insert_%sdo: block unfound", (redo) ? "re" : "un");
		page = (Page) BufferGetPage(buffer);
		if (PageIsNew((PageHeader) page))
			elog(PANIC, "btree_insert_%sdo: uninitialized page", (redo) ? "re" : "un");
		pageop = (BTPageOpaque) PageGetSpecialPointer(page);

		if (redo)
		{
			/* skip if the page already reflects this record's change */
			if (XLByteLE(lsn, PageGetLSN(page)))
			{
				UnlockAndReleaseBuffer(buffer);
			}
			else
			{
				if (PageAddItem(page, (Item) datapos, datalen,
							ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
								LP_USED) == InvalidOffsetNumber)
					elog(PANIC, "btree_insert_redo: failed to add item");

				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}
		else
		{
			/* undo: sanity-check the LSN, then bail out */
			if (XLByteLT(PageGetLSN(page), lsn))
				elog(PANIC, "btree_insert_undo: bad page LSN");

			if (!P_ISLEAF(pageop))
			{
				/* upper-level inserts are not undone */
				UnlockAndReleaseBuffer(buffer);
			}
			else
			{
				/* leaf-level undo has never been implemented */
				elog(PANIC, "btree_insert_undo: unimplemented");
			}
		}
	}

	if (redo)					/* metapage changes not undoable */
	{
		if (ismeta)
			_bt_restore_meta(reln, lsn,
							 md.root, md.level,
							 md.fastroot, md.fastlevel);
	}

	/* Forget any split this insertion completes */
	if (redo && !isleaf && incomplete_splits != NIL)
	{
		forget_matching_split(reln, xlrec->target.node,
						ItemPointerGetBlockNumber(&(xlrec->target.tid)),
						ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
							  false);
	}
}
|
||||||
|
|
||||||
|
/*
 * Replay (or undo) a btree page split.
 *
 * onleft: the new tuple went into the left half (XLOG_BTREE_SPLIT_L*);
 * isroot: the split page was the root (XLOG_BTREE_SPLIT_*_ROOT).
 *
 * Both halves are rebuilt in full from the item images in the record
 * (left half first, occupying xlrec->leftlen bytes).  The left-link of
 * the page to the right of the new right half is then fixed, and the
 * split is logged as incomplete until the parent-level downlink
 * insertion is replayed.
 */
static void
btree_xlog_split(bool redo, bool onleft, bool isroot,
				 XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
	Relation	reln;
	BlockNumber targetblk;
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;
	char	   *op = (redo) ? "redo" : "undo";

	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;

	/* target.tid names the half that got the new tuple */
	targetblk = ItemPointerGetBlockNumber(&(xlrec->target.tid));
	leftsib = (onleft) ? targetblk : xlrec->otherblk;
	rightsib = (onleft) ? xlrec->otherblk : targetblk;

	/* Left (original) sibling */
	buffer = XLogReadBuffer(false, reln, leftsib);
	if (!BufferIsValid(buffer))
		elog(PANIC, "btree_split_%s: lost left sibling", op);

	page = (Page) BufferGetPage(buffer);
	if (redo)
		_bt_pageinit(page, BufferGetPageSize(buffer));
	else if (PageIsNew((PageHeader) page))
		elog(PANIC, "btree_split_undo: uninitialized left sibling");
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	if (redo)
	{
		pageop->btpo_prev = xlrec->leftblk;
		pageop->btpo_next = rightsib;
		pageop->btpo.level = xlrec->level;
		/* level 0 means this is a leaf page */
		pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;

		_bt_restore_page(page,
						 (char *) xlrec + SizeOfBtreeSplit,
						 xlrec->leftlen);

		PageSetLSN(page, lsn);
		PageSetSUI(page, ThisStartUpID);
		UnlockAndWriteBuffer(buffer);
	}
	else
	{
		/* undo */
		if (XLByteLT(PageGetLSN(page), lsn))
			elog(PANIC, "btree_split_undo: bad left sibling LSN");
		elog(PANIC, "btree_split_undo: unimplemented");
	}

	/* Right (new) sibling: extend the relation if needed during redo */
	buffer = XLogReadBuffer((redo) ? true : false, reln, rightsib);
	if (!BufferIsValid(buffer))
		elog(PANIC, "btree_split_%s: lost right sibling", op);

	page = (Page) BufferGetPage(buffer);
	if (redo)
		_bt_pageinit(page, BufferGetPageSize(buffer));
	else if (PageIsNew((PageHeader) page))
		elog(PANIC, "btree_split_undo: uninitialized right sibling");
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	if (redo)
	{
		pageop->btpo_prev = leftsib;
		pageop->btpo_next = xlrec->rightblk;
		pageop->btpo.level = xlrec->level;
		pageop->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;

		/* right half's items follow the left half's in the record */
		_bt_restore_page(page,
					 (char *) xlrec + SizeOfBtreeSplit + xlrec->leftlen,
					 record->xl_len - SizeOfBtreeSplit - xlrec->leftlen);

		PageSetLSN(page, lsn);
		PageSetSUI(page, ThisStartUpID);
		UnlockAndWriteBuffer(buffer);
	}
	else
	{
		/* undo */
		if (XLByteLT(PageGetLSN(page), lsn))
			elog(PANIC, "btree_split_undo: bad right sibling LSN");
		elog(PANIC, "btree_split_undo: unimplemented");
	}

	/* Fix left-link of right (next) page */
	if (redo && !(record->xl_info & XLR_BKP_BLOCK_1))
	{
		if (xlrec->rightblk != P_NONE)
		{
			buffer = XLogReadBuffer(false, reln, xlrec->rightblk);
			if (!BufferIsValid(buffer))
				elog(PANIC, "btree_split_redo: lost next right page");

			page = (Page) BufferGetPage(buffer);
			if (PageIsNew((PageHeader) page))
				elog(PANIC, "btree_split_redo: uninitialized next right page");

			if (XLByteLE(lsn, PageGetLSN(page)))
			{
				UnlockAndReleaseBuffer(buffer);
			}
			else
			{
				pageop = (BTPageOpaque) PageGetSpecialPointer(page);
				pageop->btpo_prev = rightsib;

				PageSetLSN(page, lsn);
				PageSetSUI(page, ThisStartUpID);
				UnlockAndWriteBuffer(buffer);
			}
		}
	}

	/* Forget any split this insertion completes */
	if (redo && xlrec->level > 0 && incomplete_splits != NIL)
	{
		forget_matching_split(reln, xlrec->target.node,
						ItemPointerGetBlockNumber(&(xlrec->target.tid)),
						ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
							  false);
	}

	/* The job ain't done till the parent link is inserted... */
	log_incomplete_split(xlrec->target.node,
						 leftsib, rightsib, isroot);
}
|
||||||
|
|
||||||
|
/*
 * Replay deletion of a single index tuple.  Undo is not supported, and a
 * backup block makes the replay unnecessary.
 */
static void
btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_delete *xlrec;
	Relation	reln;
	Buffer		buffer;
	Page		page;

	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	xlrec = (xl_btree_delete *) XLogRecGetData(record);
	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node);
	if (!RelationIsValid(reln))
		return;
	buffer = XLogReadBuffer(false, reln,
						ItemPointerGetBlockNumber(&(xlrec->target.tid)));
	if (!BufferIsValid(buffer))
		elog(PANIC, "btree_delete_redo: block unfound");
	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page))
		elog(PANIC, "btree_delete_redo: uninitialized page");

	/* page already reflects this change? */
	if (XLByteLE(lsn, PageGetLSN(page)))
	{
		UnlockAndReleaseBuffer(buffer);
		return;
	}

	PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid)));

	PageSetLSN(page, lsn);
	PageSetSUI(page, ThisStartUpID);
	UnlockAndWriteBuffer(buffer);
}
|
||||||
|
|
||||||
|
/*
 * Replay creation of a new root page.  The root page is rebuilt in full
 * from the record (its items, if any, follow the fixed header), and the
 * metapage is updated to point at it.  A newroot record with items is the
 * parent-level step of a root split, so it may complete an incomplete
 * split entry.
 */
static void
btree_xlog_newroot(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;
	BTPageOpaque pageop;

	if (!redo)
		return;					/* not undoable */

	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
	if (!RelationIsValid(reln))
		return;
	buffer = XLogReadBuffer(true, reln, xlrec->rootblk);
	if (!BufferIsValid(buffer))
		elog(PANIC, "btree_newroot_redo: no root page");

	page = (Page) BufferGetPage(buffer);
	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_flags = BTP_ROOT;
	pageop->btpo_prev = pageop->btpo_next = P_NONE;
	pageop->btpo.level = xlrec->level;
	if (xlrec->level == 0)
		pageop->btpo_flags |= BTP_LEAF;

	/* restore the root's items, if the record carries any */
	if (record->xl_len > SizeOfBtreeNewroot)
		_bt_restore_page(page,
						 (char *) xlrec + SizeOfBtreeNewroot,
						 record->xl_len - SizeOfBtreeNewroot);

	PageSetLSN(page, lsn);
	PageSetSUI(page, ThisStartUpID);
	UnlockAndWriteBuffer(buffer);

	/* new root is also the fast root */
	_bt_restore_meta(reln, lsn,
					 xlrec->rootblk, xlrec->level,
					 xlrec->rootblk, xlrec->level);

	/* Check to see if this satisfies any incomplete insertions */
	if (record->xl_len > SizeOfBtreeNewroot &&
		incomplete_splits != NIL)
	{
		forget_matching_split(reln, xlrec->node,
							  xlrec->rootblk,
							  P_FIRSTKEY,
							  true);
	}
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
btree_xlog_newmeta(bool redo, XLogRecPtr lsn, XLogRecord *record)
|
||||||
|
{
|
||||||
|
xl_btree_newmeta *xlrec = (xl_btree_newmeta *) XLogRecGetData(record);
|
||||||
|
Relation reln;
|
||||||
|
|
||||||
|
if (!redo)
|
||||||
|
return; /* not undoable */
|
||||||
|
|
||||||
|
reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
|
||||||
|
if (!RelationIsValid(reln))
|
||||||
|
return;
|
||||||
|
|
||||||
|
_bt_restore_meta(reln, lsn,
|
||||||
|
xlrec->meta.root, xlrec->meta.level,
|
||||||
|
xlrec->meta.fastroot, xlrec->meta.fastlevel);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Replay creation of a complete new page (used e.g. when an index is
 * built from scratch): the record carries a full BLCKSZ page image,
 * which is copied verbatim over the target block.
 */
static void
btree_xlog_newpage(bool redo, XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_newpage *xlrec = (xl_btree_newpage *) XLogRecGetData(record);
	Relation	reln;
	Buffer		buffer;
	Page		page;

	if (!redo || (record->xl_info & XLR_BKP_BLOCK_1))
		return;

	reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node);
	if (!RelationIsValid(reln))
		return;
	/* extend = true: the block may be past current EOF */
	buffer = XLogReadBuffer(true, reln, xlrec->blkno);
	if (!BufferIsValid(buffer))
		elog(PANIC, "btree_newpage_redo: block unfound");
	page = (Page) BufferGetPage(buffer);

	Assert(record->xl_len == SizeOfBtreeNewpage + BLCKSZ);
	memcpy(page, (char *) xlrec + SizeOfBtreeNewpage, BLCKSZ);

	PageSetLSN(page, lsn);
	PageSetSUI(page, ThisStartUpID);
	UnlockAndWriteBuffer(buffer);
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
btree_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||||
|
{
|
||||||
|
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
||||||
|
|
||||||
|
switch (info)
|
||||||
|
{
|
||||||
|
case XLOG_BTREE_INSERT_LEAF:
|
||||||
|
btree_xlog_insert(true, true, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_INSERT_UPPER:
|
||||||
|
btree_xlog_insert(true, false, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_INSERT_META:
|
||||||
|
btree_xlog_insert(true, false, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_L:
|
||||||
|
btree_xlog_split(true, true, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_R:
|
||||||
|
btree_xlog_split(true, false, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_L_ROOT:
|
||||||
|
btree_xlog_split(true, true, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_R_ROOT:
|
||||||
|
btree_xlog_split(true, false, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_DELETE:
|
||||||
|
btree_xlog_delete(true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_DELETE_PAGE:
|
||||||
|
case XLOG_BTREE_DELETE_PAGE_META:
|
||||||
|
// ???
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWROOT:
|
||||||
|
btree_xlog_newroot(true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWMETA:
|
||||||
|
btree_xlog_newmeta(true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWPAGE:
|
||||||
|
btree_xlog_newpage(true, lsn, record);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
elog(PANIC, "btree_redo: unknown op code %u", info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
btree_undo(XLogRecPtr lsn, XLogRecord *record)
|
||||||
|
{
|
||||||
|
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
||||||
|
|
||||||
|
switch (info)
|
||||||
|
{
|
||||||
|
case XLOG_BTREE_INSERT_LEAF:
|
||||||
|
btree_xlog_insert(false, true, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_INSERT_UPPER:
|
||||||
|
btree_xlog_insert(false, false, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_INSERT_META:
|
||||||
|
btree_xlog_insert(false, false, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_L:
|
||||||
|
btree_xlog_split(false, true, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_R:
|
||||||
|
btree_xlog_split(false, false, false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_L_ROOT:
|
||||||
|
btree_xlog_split(false, true, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_SPLIT_R_ROOT:
|
||||||
|
btree_xlog_split(false, false, true, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_DELETE:
|
||||||
|
btree_xlog_delete(false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_DELETE_PAGE:
|
||||||
|
case XLOG_BTREE_DELETE_PAGE_META:
|
||||||
|
// ???
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWROOT:
|
||||||
|
btree_xlog_newroot(false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWMETA:
|
||||||
|
btree_xlog_newmeta(false, lsn, record);
|
||||||
|
break;
|
||||||
|
case XLOG_BTREE_NEWPAGE:
|
||||||
|
btree_xlog_newpage(false, lsn, record);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
elog(PANIC, "btree_undo: unknown op code %u", info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
out_target(char *buf, xl_btreetid *target)
|
||||||
|
{
|
||||||
|
sprintf(buf + strlen(buf), "node %u/%u; tid %u/%u",
|
||||||
|
target->node.tblNode, target->node.relNode,
|
||||||
|
ItemPointerGetBlockNumber(&(target->tid)),
|
||||||
|
ItemPointerGetOffsetNumber(&(target->tid)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * btree_desc -- produce a human-readable description of a btree WAL
 * record, appended to buf (used by WAL debugging/inspection output).
 */
void
btree_desc(char *buf, uint8 xl_info, char *rec)
{
	uint8		info = xl_info & ~XLR_INFO_MASK;

	switch (info)
	{
		case XLOG_BTREE_INSERT_LEAF:
			{
				xl_btree_insert *xlrec = (xl_btree_insert *) rec;

				strcat(buf, "insert: ");
				out_target(buf, &(xlrec->target));
				break;
			}
		case XLOG_BTREE_INSERT_UPPER:
			{
				xl_btree_insert *xlrec = (xl_btree_insert *) rec;

				strcat(buf, "insert_upper: ");
				out_target(buf, &(xlrec->target));
				break;
			}
		case XLOG_BTREE_INSERT_META:
			{
				xl_btree_insert *xlrec = (xl_btree_insert *) rec;

				strcat(buf, "insert_meta: ");
				out_target(buf, &(xlrec->target));
				break;
			}
		case XLOG_BTREE_SPLIT_L:
			{
				xl_btree_split *xlrec = (xl_btree_split *) rec;

				strcat(buf, "split_l: ");
				out_target(buf, &(xlrec->target));
				sprintf(buf + strlen(buf), "; oth %u; rgh %u",
						xlrec->otherblk, xlrec->rightblk);
				break;
			}
		case XLOG_BTREE_SPLIT_R:
			{
				xl_btree_split *xlrec = (xl_btree_split *) rec;

				strcat(buf, "split_r: ");
				out_target(buf, &(xlrec->target));
				sprintf(buf + strlen(buf), "; oth %u; rgh %u",
						xlrec->otherblk, xlrec->rightblk);
				break;
			}
		case XLOG_BTREE_SPLIT_L_ROOT:
			{
				xl_btree_split *xlrec = (xl_btree_split *) rec;

				strcat(buf, "split_l_root: ");
				out_target(buf, &(xlrec->target));
				sprintf(buf + strlen(buf), "; oth %u; rgh %u",
						xlrec->otherblk, xlrec->rightblk);
				break;
			}
		case XLOG_BTREE_SPLIT_R_ROOT:
			{
				xl_btree_split *xlrec = (xl_btree_split *) rec;

				strcat(buf, "split_r_root: ");
				out_target(buf, &(xlrec->target));
				sprintf(buf + strlen(buf), "; oth %u; rgh %u",
						xlrec->otherblk, xlrec->rightblk);
				break;
			}
		case XLOG_BTREE_DELETE:
			{
				xl_btree_delete *xlrec = (xl_btree_delete *) rec;

				strcat(buf, "delete: ");
				out_target(buf, &(xlrec->target));
				break;
			}
		case XLOG_BTREE_DELETE_PAGE:
		case XLOG_BTREE_DELETE_PAGE_META:
			{
				xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;

				strcat(buf, "delete_page: ");
				out_target(buf, &(xlrec->target));
				sprintf(buf + strlen(buf), "; dead %u; left %u; right %u",
						xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
				break;
			}
		case XLOG_BTREE_NEWROOT:
			{
				xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;

				sprintf(buf + strlen(buf), "newroot: node %u/%u; root %u lev %u",
						xlrec->node.tblNode, xlrec->node.relNode,
						xlrec->rootblk, xlrec->level);
				break;
			}
		case XLOG_BTREE_NEWMETA:
			{
				xl_btree_newmeta *xlrec = (xl_btree_newmeta *) rec;

				sprintf(buf + strlen(buf), "newmeta: node %u/%u; root %u lev %u fast %u lev %u",
						xlrec->node.tblNode, xlrec->node.relNode,
						xlrec->meta.root, xlrec->meta.level,
						xlrec->meta.fastroot, xlrec->meta.fastlevel);
				break;
			}
		case XLOG_BTREE_NEWPAGE:
			{
				xl_btree_newpage *xlrec = (xl_btree_newpage *) rec;

				sprintf(buf + strlen(buf), "newpage: node %u/%u; page %u",
						xlrec->node.tblNode, xlrec->node.relNode,
						xlrec->blkno);
				break;
			}
		default:
			strcat(buf, "UNKNOWN");
			break;
	}
}
|
||||||
|
|
||||||
|
/*
 * btree_xlog_startup -- resource-manager recovery-start hook.
 *
 * Reset the incomplete-splits list; any entries remaining at the end of
 * recovery are finished by btree_xlog_cleanup().
 */
void
btree_xlog_startup(void)
{
	incomplete_splits = NIL;
}
|
||||||
|
|
||||||
|
/*
 * btree_xlog_cleanup -- resource-manager recovery-end hook.
 *
 * Any splits still recorded as incomplete never had their parent-level
 * downlink insertion reach the WAL (the original transaction crashed
 * mid-split).  Finish each one now by performing the missing parent
 * insertion via _bt_insert_parent(), exactly as the original backend
 * would have done.
 */
void
btree_xlog_cleanup(void)
{
	List	   *l;

	foreach(l, incomplete_splits)
	{
		bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
		Relation	reln;
		Buffer		lbuf,
					rbuf;
		Page		lpage,
					rpage;
		BTPageOpaque lpageop,
					rpageop;
		bool		is_only;

		reln = XLogOpenRelation(true, RM_BTREE_ID, split->node);
		if (!RelationIsValid(reln))
			continue;
		lbuf = XLogReadBuffer(false, reln, split->leftblk);
		if (!BufferIsValid(lbuf))
			elog(PANIC, "btree_xlog_cleanup: left block unfound");
		lpage = (Page) BufferGetPage(lbuf);
		lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
		rbuf = XLogReadBuffer(false, reln, split->rightblk);
		if (!BufferIsValid(rbuf))
			elog(PANIC, "btree_xlog_cleanup: right block unfound");
		rpage = (Page) BufferGetPage(rbuf);
		rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);

		/*
		 * If the two halves are the only pages on their level (left is
		 * leftmost and right is rightmost), the split was of the sole
		 * page of that level.
		 */
		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);

		_bt_insert_parent(reln, lbuf, rbuf, (BTStack) NULL,
						  split->is_root, is_only);
	}
	incomplete_splits = NIL;
}
|
|
@ -3,7 +3,7 @@
|
||||||
*
|
*
|
||||||
* Resource managers definition
|
* Resource managers definition
|
||||||
*
|
*
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.9 2001/08/25 18:52:41 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/transam/rmgr.c,v 1.10 2003/02/21 00:06:22 tgl Exp $
|
||||||
*/
|
*/
|
||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
|
|
||||||
|
@ -19,21 +19,22 @@
|
||||||
#include "commands/sequence.h"
|
#include "commands/sequence.h"
|
||||||
|
|
||||||
|
|
||||||
RmgrData RmgrTable[] = {
|
RmgrData RmgrTable[RM_MAX_ID+1] = {
|
||||||
{"XLOG", xlog_redo, xlog_undo, xlog_desc},
|
{"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
|
||||||
{"Transaction", xact_redo, xact_undo, xact_desc},
|
{"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
|
||||||
{"Storage", smgr_redo, smgr_undo, smgr_desc},
|
{"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
|
||||||
{"CLOG", clog_redo, clog_undo, clog_desc},
|
{"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
|
||||||
{"Reserved 4", NULL, NULL, NULL},
|
{"Reserved 4", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Reserved 5", NULL, NULL, NULL},
|
{"Reserved 5", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Reserved 6", NULL, NULL, NULL},
|
{"Reserved 6", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Reserved 7", NULL, NULL, NULL},
|
{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Reserved 8", NULL, NULL, NULL},
|
{"Reserved 8", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Reserved 9", NULL, NULL, NULL},
|
{"Reserved 9", NULL, NULL, NULL, NULL, NULL},
|
||||||
{"Heap", heap_redo, heap_undo, heap_desc},
|
{"Heap", heap_redo, heap_undo, heap_desc, NULL, NULL},
|
||||||
{"Btree", btree_redo, btree_undo, btree_desc},
|
{"Btree", btree_redo, btree_undo, btree_desc,
|
||||||
{"Hash", hash_redo, hash_undo, hash_desc},
|
btree_xlog_startup, btree_xlog_cleanup},
|
||||||
{"Rtree", rtree_redo, rtree_undo, rtree_desc},
|
{"Hash", hash_redo, hash_undo, hash_desc, NULL, NULL},
|
||||||
{"Gist", gist_redo, gist_undo, gist_desc},
|
{"Rtree", rtree_redo, rtree_undo, rtree_desc, NULL, NULL},
|
||||||
{"Sequence", seq_redo, seq_undo, seq_desc}
|
{"Gist", gist_redo, gist_undo, gist_desc, NULL, NULL},
|
||||||
|
{"Sequence", seq_redo, seq_undo, seq_desc, NULL, NULL}
|
||||||
};
|
};
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.111 2003/01/25 03:06:04 tgl Exp $
|
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.112 2003/02/21 00:06:22 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -1203,16 +1203,6 @@ XLogFlush(XLogRecPtr record)
|
||||||
XLogRecPtr WriteRqstPtr;
|
XLogRecPtr WriteRqstPtr;
|
||||||
XLogwrtRqst WriteRqst;
|
XLogwrtRqst WriteRqst;
|
||||||
|
|
||||||
if (XLOG_DEBUG)
|
|
||||||
{
|
|
||||||
elog(LOG, "XLogFlush%s%s: request %X/%X; write %X/%X; flush %X/%X",
|
|
||||||
(IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
|
|
||||||
(InRedo) ? "(redo)" : "",
|
|
||||||
record.xlogid, record.xrecoff,
|
|
||||||
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
|
|
||||||
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Disabled during REDO */
|
/* Disabled during REDO */
|
||||||
if (InRedo)
|
if (InRedo)
|
||||||
return;
|
return;
|
||||||
|
@ -1221,6 +1211,15 @@ XLogFlush(XLogRecPtr record)
|
||||||
if (XLByteLE(record, LogwrtResult.Flush))
|
if (XLByteLE(record, LogwrtResult.Flush))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (XLOG_DEBUG)
|
||||||
|
{
|
||||||
|
elog(LOG, "XLogFlush%s: request %X/%X; write %X/%X; flush %X/%X",
|
||||||
|
(IsBootstrapProcessingMode()) ? "(bootstrap)" : "",
|
||||||
|
record.xlogid, record.xrecoff,
|
||||||
|
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
|
||||||
|
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
|
||||||
|
}
|
||||||
|
|
||||||
START_CRIT_SECTION();
|
START_CRIT_SECTION();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -2515,6 +2514,12 @@ StartupXLOG(void)
|
||||||
elog(LOG, "database system was interrupted at %s",
|
elog(LOG, "database system was interrupted at %s",
|
||||||
str_time(ControlFile->time));
|
str_time(ControlFile->time));
|
||||||
|
|
||||||
|
/* This is just to allow attaching to startup process with a debugger */
|
||||||
|
#ifdef XLOG_REPLAY_DELAY
|
||||||
|
if (XLOG_DEBUG && ControlFile->state != DB_SHUTDOWNED)
|
||||||
|
sleep(60);
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get the last valid checkpoint record. If the latest one according
|
* Get the last valid checkpoint record. If the latest one according
|
||||||
* to pg_control is broken, try the next-to-last one.
|
* to pg_control is broken, try the next-to-last one.
|
||||||
|
@ -2578,14 +2583,23 @@ StartupXLOG(void)
|
||||||
/* REDO */
|
/* REDO */
|
||||||
if (InRecovery)
|
if (InRecovery)
|
||||||
{
|
{
|
||||||
|
int rmid;
|
||||||
|
|
||||||
elog(LOG, "database system was not properly shut down; "
|
elog(LOG, "database system was not properly shut down; "
|
||||||
"automatic recovery in progress");
|
"automatic recovery in progress");
|
||||||
ControlFile->state = DB_IN_RECOVERY;
|
ControlFile->state = DB_IN_RECOVERY;
|
||||||
ControlFile->time = time(NULL);
|
ControlFile->time = time(NULL);
|
||||||
UpdateControlFile();
|
UpdateControlFile();
|
||||||
|
|
||||||
|
/* Start up the recovery environment */
|
||||||
XLogInitRelationCache();
|
XLogInitRelationCache();
|
||||||
|
|
||||||
|
for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
|
||||||
|
{
|
||||||
|
if (RmgrTable[rmid].rm_startup != NULL)
|
||||||
|
RmgrTable[rmid].rm_startup();
|
||||||
|
}
|
||||||
|
|
||||||
/* Is REDO required ? */
|
/* Is REDO required ? */
|
||||||
if (XLByteLT(checkPoint.redo, RecPtr))
|
if (XLByteLT(checkPoint.redo, RecPtr))
|
||||||
record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
|
record = ReadRecord(&(checkPoint.redo), PANIC, buffer);
|
||||||
|
@ -2737,7 +2751,25 @@ StartupXLOG(void)
|
||||||
|
|
||||||
if (InRecovery)
|
if (InRecovery)
|
||||||
{
|
{
|
||||||
|
int rmid;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* Allow resource managers to do any required cleanup.
|
||||||
|
*/
|
||||||
|
for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
|
||||||
|
{
|
||||||
|
if (RmgrTable[rmid].rm_cleanup != NULL)
|
||||||
|
RmgrTable[rmid].rm_cleanup();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* suppress in-transaction check in CreateCheckPoint */
|
||||||
|
MyLastRecPtr.xrecoff = 0;
|
||||||
|
MyXactMadeXLogEntry = false;
|
||||||
|
MyXactMadeTempRelUpdate = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Perform a new checkpoint to update our recovery activity to disk.
|
||||||
|
*
|
||||||
* In case we had to use the secondary checkpoint, make sure that
|
* In case we had to use the secondary checkpoint, make sure that
|
||||||
* it will still be shown as the secondary checkpoint after this
|
* it will still be shown as the secondary checkpoint after this
|
||||||
* CreateCheckPoint operation; we don't want the broken primary
|
* CreateCheckPoint operation; we don't want the broken primary
|
||||||
|
@ -2745,6 +2777,10 @@ StartupXLOG(void)
|
||||||
*/
|
*/
|
||||||
ControlFile->checkPoint = checkPointLoc;
|
ControlFile->checkPoint = checkPointLoc;
|
||||||
CreateCheckPoint(true, true);
|
CreateCheckPoint(true, true);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Close down recovery environment
|
||||||
|
*/
|
||||||
XLogCloseRelationCache();
|
XLogCloseRelationCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $
|
* $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -22,46 +22,55 @@
|
||||||
/*
|
/*
|
||||||
* BTPageOpaqueData -- At the end of every page, we store a pointer
|
* BTPageOpaqueData -- At the end of every page, we store a pointer
|
||||||
* to both siblings in the tree. This is used to do forward/backward
|
* to both siblings in the tree. This is used to do forward/backward
|
||||||
* index scans. See Lehman and Yao's paper for more
|
* index scans. The next-page link is also critical for recovery when
|
||||||
* info. In addition, we need to know what type of page this is
|
* a search has navigated to the wrong page due to concurrent page splits
|
||||||
* (leaf or internal), and whether the page is available for reuse.
|
* or deletions; see src/backend/access/nbtree/README for more info.
|
||||||
*
|
*
|
||||||
* We also store a back-link to the parent page, but this cannot be trusted
|
* In addition, we store the page's btree level (counting upwards from
|
||||||
* very far since it does not get updated when the parent is split.
|
* zero at a leaf page) as well as some flag bits indicating the page type
|
||||||
* See backend/access/nbtree/README for details.
|
* and status. If the page is deleted, we replace the level with the
|
||||||
|
* next-transaction-ID value indicating when it is safe to reclaim the page.
|
||||||
|
*
|
||||||
|
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
|
||||||
|
* instead.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct BTPageOpaqueData
|
typedef struct BTPageOpaqueData
|
||||||
{
|
{
|
||||||
BlockNumber btpo_prev; /* used for backward index scans */
|
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
|
||||||
BlockNumber btpo_next; /* used for forward index scans */
|
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
|
||||||
BlockNumber btpo_parent; /* pointer to parent, but not updated on
|
union
|
||||||
* parent split */
|
{
|
||||||
uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, META?, REORDER? */
|
uint32 level; /* tree level --- zero for leaf pages */
|
||||||
|
TransactionId xact; /* next transaction ID, if deleted */
|
||||||
|
} btpo;
|
||||||
|
uint16 btpo_flags; /* flag bits, see below */
|
||||||
} BTPageOpaqueData;
|
} BTPageOpaqueData;
|
||||||
|
|
||||||
typedef BTPageOpaqueData *BTPageOpaque;
|
typedef BTPageOpaqueData *BTPageOpaque;
|
||||||
|
|
||||||
/* Bits defined in btpo_flags */
|
/* Bits defined in btpo_flags */
|
||||||
#define BTP_LEAF (1 << 0) /* leaf page, if not internal page */
|
#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
|
||||||
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
|
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
|
||||||
#define BTP_FREE (1 << 2) /* page not in use */
|
#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
|
||||||
#define BTP_META (1 << 3) /* meta-page */
|
#define BTP_META (1 << 3) /* meta-page */
|
||||||
#define BTP_REORDER (1 << 4) /* items need reordering */
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The Meta page is always the first page in the btree index.
|
* The Meta page is always the first page in the btree index.
|
||||||
* Its primary purpose is to point to the location of the btree root page.
|
* Its primary purpose is to point to the location of the btree root page.
|
||||||
|
* We also point to the "fast" root, which is the current effective root;
|
||||||
|
* see README for discussion.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
typedef struct BTMetaPageData
|
typedef struct BTMetaPageData
|
||||||
{
|
{
|
||||||
uint32 btm_magic;
|
uint32 btm_magic; /* should contain BTREE_MAGIC */
|
||||||
uint32 btm_version;
|
uint32 btm_version; /* should contain BTREE_VERSION */
|
||||||
BlockNumber btm_root;
|
BlockNumber btm_root; /* current root location */
|
||||||
int32 btm_level;
|
uint32 btm_level; /* tree level of the root page */
|
||||||
|
BlockNumber btm_fastroot; /* current "fast" root location */
|
||||||
|
uint32 btm_fastlevel; /* tree level of the "fast" root page */
|
||||||
} BTMetaPageData;
|
} BTMetaPageData;
|
||||||
|
|
||||||
#define BTPageGetMeta(p) \
|
#define BTPageGetMeta(p) \
|
||||||
|
@ -69,12 +78,7 @@ typedef struct BTMetaPageData
|
||||||
|
|
||||||
#define BTREE_METAPAGE 0 /* first page is meta */
|
#define BTREE_METAPAGE 0 /* first page is meta */
|
||||||
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
|
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
|
||||||
|
#define BTREE_VERSION 2 /* current version number */
|
||||||
#define BTreeInvalidParent(opaque) \
|
|
||||||
(opaque->btpo_parent == InvalidBlockNumber || \
|
|
||||||
opaque->btpo_parent == BTREE_METAPAGE)
|
|
||||||
|
|
||||||
#define BTREE_VERSION 1
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We actually need to be able to fit three items on every page,
|
* We actually need to be able to fit three items on every page,
|
||||||
|
@ -84,6 +88,295 @@ typedef struct BTMetaPageData
|
||||||
((PageGetPageSize(page) - \
|
((PageGetPageSize(page) - \
|
||||||
sizeof(PageHeaderData) - \
|
sizeof(PageHeaderData) - \
|
||||||
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
|
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BTItems are what we store in the btree. Each item is an index tuple,
|
||||||
|
* including key and pointer values. (In some cases either the key or the
|
||||||
|
* pointer may go unused, see backend/access/nbtree/README for details.)
|
||||||
|
*
|
||||||
|
* Old comments:
|
||||||
|
* In addition, we must guarantee that all tuples in the index are unique,
|
||||||
|
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
||||||
|
* do this is by generating a new OID for every insertion that we do in the
|
||||||
|
* tree. This adds eight bytes to the size of btree index tuples. Note
|
||||||
|
* that we do not use the OID as part of a composite key; the OID only
|
||||||
|
* serves as a unique identifier for a given index tuple (logical position
|
||||||
|
* within a page).
|
||||||
|
*
|
||||||
|
* New comments:
|
||||||
|
* actually, we must guarantee that all tuples in A LEVEL
|
||||||
|
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
|
||||||
|
* as unique identifier for a given index tuple (logical position
|
||||||
|
* within a level). - vadim 04/09/97
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct BTItemData
|
||||||
|
{
|
||||||
|
IndexTupleData bti_itup;
|
||||||
|
} BTItemData;
|
||||||
|
|
||||||
|
typedef BTItemData *BTItem;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For XLOG: size without alignment. Sizeof works as long as
|
||||||
|
* IndexTupleData has exactly 8 bytes.
|
||||||
|
*/
|
||||||
|
#define SizeOfBTItem sizeof(BTItemData)
|
||||||
|
|
||||||
|
/* Test whether items are the "same" per the above notes */
|
||||||
|
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
|
||||||
|
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
|
||||||
|
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
|
||||||
|
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
|
||||||
|
(i1)->bti_itup.t_tid.ip_posid == \
|
||||||
|
(i2)->bti_itup.t_tid.ip_posid )
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In general, the btree code tries to localize its knowledge about
|
||||||
|
* page layout to a couple of routines. However, we need a special
|
||||||
|
* value to indicate "no page number" in those places where we expect
|
||||||
|
* page numbers. We can use zero for this because we never need to
|
||||||
|
* make a pointer to the metadata page.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define P_NONE 0
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Macros to test whether a page is leftmost or rightmost on its tree level,
|
||||||
|
* as well as other state info kept in the opaque data.
|
||||||
|
*/
|
||||||
|
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
|
||||||
|
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
|
||||||
|
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
|
||||||
|
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
|
||||||
|
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
|
||||||
|
* page. The high key is not a data key, but gives info about what range of
|
||||||
|
* keys is supposed to be on this page. The high key on a page is required
|
||||||
|
* to be greater than or equal to any data key that appears on the page.
|
||||||
|
* If we find ourselves trying to insert a key > high key, we know we need
|
||||||
|
* to move right (this should only happen if the page was split since we
|
||||||
|
* examined the parent page).
|
||||||
|
*
|
||||||
|
* Our insertion algorithm guarantees that we can use the initial least key
|
||||||
|
* on our right sibling as the high key. Once a page is created, its high
|
||||||
|
* key changes only if the page is split.
|
||||||
|
*
|
||||||
|
* On a non-rightmost page, the high key lives in item 1 and data items
|
||||||
|
* start in item 2. Rightmost pages have no high key, so we store data
|
||||||
|
* items beginning in item 1.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define P_HIKEY ((OffsetNumber) 1)
|
||||||
|
#define P_FIRSTKEY ((OffsetNumber) 2)
|
||||||
|
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* XLOG records for btree operations
|
||||||
|
*
|
||||||
|
* XLOG allows to store some information in high 4 bits of log
|
||||||
|
* record xl_info field
|
||||||
|
*/
|
||||||
|
#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */
|
||||||
|
#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
|
||||||
|
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
|
||||||
|
#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */
|
||||||
|
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
|
||||||
|
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */
|
||||||
|
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
|
||||||
|
#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */
|
||||||
|
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
|
||||||
|
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
|
||||||
|
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
|
||||||
|
#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */
|
||||||
|
#define XLOG_BTREE_NEWPAGE 0xC0 /* new index page during build */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All that we need to find changed index tuple
|
||||||
|
*/
|
||||||
|
typedef struct xl_btreetid
|
||||||
|
{
|
||||||
|
RelFileNode node;
|
||||||
|
ItemPointerData tid; /* changed tuple id */
|
||||||
|
} xl_btreetid;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All that we need to regenerate the meta-data page
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_metadata
|
||||||
|
{
|
||||||
|
BlockNumber root;
|
||||||
|
uint32 level;
|
||||||
|
BlockNumber fastroot;
|
||||||
|
uint32 fastlevel;
|
||||||
|
} xl_btree_metadata;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is what we need to know about simple (without split) insert.
|
||||||
|
*
|
||||||
|
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
|
||||||
|
* Note that INSERT_META implies it's not a leaf page.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_insert
|
||||||
|
{
|
||||||
|
xl_btreetid target; /* inserted tuple id */
|
||||||
|
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
|
||||||
|
/* BTITEM FOLLOWS AT END OF STRUCT */
|
||||||
|
} xl_btree_insert;
|
||||||
|
|
||||||
|
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On insert with split we save items of both left and right siblings
|
||||||
|
* and restore content of both pages from log record. This way takes less
|
||||||
|
* xlog space than the normal approach, because if we did it standardly,
|
||||||
|
* XLogInsert would almost always think the right page is new and store its
|
||||||
|
* whole page image.
|
||||||
|
*
|
||||||
|
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
|
||||||
|
* The _L and _R variants indicate whether the inserted btitem went into the
|
||||||
|
* left or right split page (and thus, whether otherblk is the right or left
|
||||||
|
* page of the split pair). The _ROOT variants indicate that we are splitting
|
||||||
|
* the root page, and thus that a newroot record rather than an insert or
|
||||||
|
* split record should follow. Note that a split record never carries a
|
||||||
|
* metapage update --- we'll do that in the parent-level update.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_split
|
||||||
|
{
|
||||||
|
xl_btreetid target; /* inserted tuple id */
|
||||||
|
BlockNumber otherblk; /* second block participated in split: */
|
||||||
|
/* first one is stored in target' tid */
|
||||||
|
BlockNumber leftblk; /* prev/left block */
|
||||||
|
BlockNumber rightblk; /* next/right block */
|
||||||
|
uint32 level; /* tree level of page being split */
|
||||||
|
uint16 leftlen; /* len of left page items below */
|
||||||
|
/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
|
||||||
|
} xl_btree_split;
|
||||||
|
|
||||||
|
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is what we need to know about delete of an individual leaf btitem
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_delete
|
||||||
|
{
|
||||||
|
xl_btreetid target; /* deleted tuple id */
|
||||||
|
} xl_btree_delete;
|
||||||
|
|
||||||
|
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is what we need to know about deletion of a btree page. The target
|
||||||
|
* identifies the tuple removed from the parent page (note that we remove
|
||||||
|
* this tuple's downlink and the *following* tuple's key). Note we do not
|
||||||
|
* store any content for the deleted page --- it is just rewritten as empty
|
||||||
|
* during recovery.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_delete_page
|
||||||
|
{
|
||||||
|
xl_btreetid target; /* deleted tuple id in parent page */
|
||||||
|
BlockNumber deadblk; /* child block being deleted */
|
||||||
|
BlockNumber leftblk; /* child block's left sibling, if any */
|
||||||
|
BlockNumber rightblk; /* child block's right sibling */
|
||||||
|
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
|
||||||
|
} xl_btree_delete_page;
|
||||||
|
|
||||||
|
#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* New root log record. There are zero btitems if this is to establish an
|
||||||
|
* empty root, or two if it is the result of splitting an old root.
|
||||||
|
*
|
||||||
|
* Note that although this implies rewriting the metadata page, we don't need
|
||||||
|
* an xl_btree_metadata record --- the rootblk and level are sufficient.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_newroot
|
||||||
|
{
|
||||||
|
RelFileNode node;
|
||||||
|
BlockNumber rootblk; /* location of new root */
|
||||||
|
uint32 level; /* its tree level */
|
||||||
|
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
|
||||||
|
} xl_btree_newroot;
|
||||||
|
|
||||||
|
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* New metapage log record. This is not issued during routine operations;
|
||||||
|
* it's only used when initializing an empty index and at completion of
|
||||||
|
* index build.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_newmeta
|
||||||
|
{
|
||||||
|
RelFileNode node;
|
||||||
|
xl_btree_metadata meta;
|
||||||
|
} xl_btree_newmeta;
|
||||||
|
|
||||||
|
#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* New index page log record. This is only used while building a new index.
|
||||||
|
*/
|
||||||
|
typedef struct xl_btree_newpage
|
||||||
|
{
|
||||||
|
RelFileNode node;
|
||||||
|
BlockNumber blkno; /* location of new page */
|
||||||
|
/* entire page contents follow at end of record */
|
||||||
|
} xl_btree_newpage;
|
||||||
|
|
||||||
|
#define SizeOfBtreeNewpage (offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define BTLessStrategyNumber 1
|
||||||
|
#define BTLessEqualStrategyNumber 2
|
||||||
|
#define BTEqualStrategyNumber 3
|
||||||
|
#define BTGreaterEqualStrategyNumber 4
|
||||||
|
#define BTGreaterStrategyNumber 5
|
||||||
|
#define BTMaxStrategyNumber 5
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When a new operator class is declared, we require that the user
|
||||||
|
* supply us with an amproc procedure for determining whether, for
|
||||||
|
* two keys a and b, a < b, a = b, or a > b. This routine must
|
||||||
|
* return < 0, 0, > 0, respectively, in these three cases. Since we
|
||||||
|
* only have one such proc in amproc, it's number 1.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define BTORDER_PROC 1
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We need to be able to tell the difference between read and write
|
||||||
|
* requests for pages, in order to do locking correctly.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define BT_READ BUFFER_LOCK_SHARE
|
||||||
|
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BTStackData -- As we descend a tree, we push the (location, downlink)
|
||||||
|
* pairs from internal pages onto a private stack. If we split a
|
||||||
|
* leaf, we use this stack to walk back up the tree and insert data
|
||||||
|
* into parent pages (and possibly to split them, too). Lehman and
|
||||||
|
* Yao's update algorithm guarantees that under no circumstances can
|
||||||
|
* our private stack give us an irredeemably bad picture up the tree.
|
||||||
|
* Again, see the paper for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct BTStackData
|
||||||
|
{
|
||||||
|
BlockNumber bts_blkno;
|
||||||
|
OffsetNumber bts_offset;
|
||||||
|
BTItemData bts_btitem;
|
||||||
|
struct BTStackData *bts_parent;
|
||||||
|
} BTStackData;
|
||||||
|
|
||||||
|
typedef BTStackData *BTStack;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* BTScanOpaqueData is used to remember which buffers we're currently
|
* BTScanOpaqueData is used to remember which buffers we're currently
|
||||||
* examining in the scan. We keep these buffers pinned (but not locked,
|
* examining in the scan. We keep these buffers pinned (but not locked,
|
||||||
|
@ -116,212 +409,6 @@ typedef struct BTScanOpaqueData
|
||||||
|
|
||||||
typedef BTScanOpaqueData *BTScanOpaque;
|
typedef BTScanOpaqueData *BTScanOpaque;
|
||||||
|
|
||||||
/*
|
|
||||||
* BTItems are what we store in the btree. Each item is an index tuple,
|
|
||||||
* including key and pointer values. (In some cases either the key or the
|
|
||||||
* pointer may go unused, see backend/access/nbtree/README for details.)
|
|
||||||
*
|
|
||||||
* Old comments:
|
|
||||||
* In addition, we must guarantee that all tuples in the index are unique,
|
|
||||||
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
|
||||||
* do this is by generating a new OID for every insertion that we do in the
|
|
||||||
* tree. This adds eight bytes to the size of btree index tuples. Note
|
|
||||||
* that we do not use the OID as part of a composite key; the OID only
|
|
||||||
* serves as a unique identifier for a given index tuple (logical position
|
|
||||||
* within a page).
|
|
||||||
*
|
|
||||||
* New comments:
|
|
||||||
* actually, we must guarantee that all tuples in A LEVEL
|
|
||||||
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
|
|
||||||
* as unique identifier for a given index tuple (logical position
|
|
||||||
* within a level). - vadim 04/09/97
|
|
||||||
*/
|
|
||||||
|
|
||||||
typedef struct BTItemData
|
|
||||||
{
|
|
||||||
IndexTupleData bti_itup;
|
|
||||||
} BTItemData;
|
|
||||||
|
|
||||||
typedef BTItemData *BTItem;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* For XLOG: size without alignement. Sizeof works as long as
|
|
||||||
* IndexTupleData has exactly 8 bytes.
|
|
||||||
*/
|
|
||||||
#define SizeOfBTItem sizeof(BTItemData)
|
|
||||||
|
|
||||||
/* Test whether items are the "same" per the above notes */
|
|
||||||
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
|
|
||||||
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
|
|
||||||
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
|
|
||||||
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
|
|
||||||
(i1)->bti_itup.t_tid.ip_posid == \
|
|
||||||
(i2)->bti_itup.t_tid.ip_posid )
|
|
||||||
|
|
||||||
/*
|
|
||||||
* BTStackData -- As we descend a tree, we push the (key, pointer)
|
|
||||||
* pairs from internal nodes onto a private stack. If we split a
|
|
||||||
* leaf, we use this stack to walk back up the tree and insert data
|
|
||||||
* into parent nodes (and possibly to split them, too). Lehman and
|
|
||||||
* Yao's update algorithm guarantees that under no circumstances can
|
|
||||||
* our private stack give us an irredeemably bad picture up the tree.
|
|
||||||
* Again, see the paper for details.
|
|
||||||
*/
|
|
||||||
|
|
||||||
typedef struct BTStackData
|
|
||||||
{
|
|
||||||
BlockNumber bts_blkno;
|
|
||||||
OffsetNumber bts_offset;
|
|
||||||
BTItemData bts_btitem;
|
|
||||||
struct BTStackData *bts_parent;
|
|
||||||
} BTStackData;
|
|
||||||
|
|
||||||
typedef BTStackData *BTStack;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We need to be able to tell the difference between read and write
|
|
||||||
* requests for pages, in order to do locking correctly.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define BT_READ BUFFER_LOCK_SHARE
|
|
||||||
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
|
||||||
|
|
||||||
/*
|
|
||||||
* In general, the btree code tries to localize its knowledge about
|
|
||||||
* page layout to a couple of routines. However, we need a special
|
|
||||||
* value to indicate "no page number" in those places where we expect
|
|
||||||
* page numbers. We can use zero for this because we never need to
|
|
||||||
* make a pointer to the metadata page.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define P_NONE 0
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Macros to test whether a page is leftmost or rightmost on its tree level,
|
|
||||||
* as well as other state info kept in the opaque data.
|
|
||||||
*/
|
|
||||||
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
|
|
||||||
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
|
|
||||||
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
|
|
||||||
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
|
|
||||||
* page. The high key is not a data key, but gives info about what range of
|
|
||||||
* keys is supposed to be on this page. The high key on a page is required
|
|
||||||
* to be greater than or equal to any data key that appears on the page.
|
|
||||||
* If we find ourselves trying to insert a key > high key, we know we need
|
|
||||||
* to move right (this should only happen if the page was split since we
|
|
||||||
* examined the parent page).
|
|
||||||
*
|
|
||||||
* Our insertion algorithm guarantees that we can use the initial least key
|
|
||||||
* on our right sibling as the high key. Once a page is created, its high
|
|
||||||
* key changes only if the page is split.
|
|
||||||
*
|
|
||||||
* On a non-rightmost page, the high key lives in item 1 and data items
|
|
||||||
* start in item 2. Rightmost pages have no high key, so we store data
|
|
||||||
* items beginning in item 1.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define P_HIKEY ((OffsetNumber) 1)
|
|
||||||
#define P_FIRSTKEY ((OffsetNumber) 2)
|
|
||||||
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* XLOG allows to store some information in high 4 bits of log
|
|
||||||
* record xl_info field
|
|
||||||
*/
|
|
||||||
#define XLOG_BTREE_DELETE 0x00 /* delete btitem */
|
|
||||||
#define XLOG_BTREE_INSERT 0x10 /* add btitem without split */
|
|
||||||
#define XLOG_BTREE_SPLIT 0x20 /* add btitem with split */
|
|
||||||
#define XLOG_BTREE_SPLEFT 0x30 /* as above + flag that new btitem */
|
|
||||||
/* goes to the left sibling */
|
|
||||||
#define XLOG_BTREE_NEWROOT 0x40 /* new root page */
|
|
||||||
|
|
||||||
#define XLOG_BTREE_LEAF 0x80 /* leaf/internal page was changed */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* All what we need to find changed index tuple
|
|
||||||
*/
|
|
||||||
typedef struct xl_btreetid
|
|
||||||
{
|
|
||||||
RelFileNode node;
|
|
||||||
ItemPointerData tid; /* changed tuple id */
|
|
||||||
} xl_btreetid;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This is what we need to know about delete
|
|
||||||
*/
|
|
||||||
typedef struct xl_btree_delete
|
|
||||||
{
|
|
||||||
xl_btreetid target; /* deleted tuple id */
|
|
||||||
} xl_btree_delete;
|
|
||||||
|
|
||||||
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* This is what we need to know about pure (without split) insert
|
|
||||||
*/
|
|
||||||
typedef struct xl_btree_insert
|
|
||||||
{
|
|
||||||
xl_btreetid target; /* inserted tuple id */
|
|
||||||
/* BTITEM FOLLOWS AT END OF STRUCT */
|
|
||||||
} xl_btree_insert;
|
|
||||||
|
|
||||||
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* On insert with split we save items of both left and right siblings
|
|
||||||
* and restore content of both pages from log record
|
|
||||||
*/
|
|
||||||
typedef struct xl_btree_split
|
|
||||||
{
|
|
||||||
xl_btreetid target; /* inserted tuple id */
|
|
||||||
BlockIdData otherblk; /* second block participated in split: */
|
|
||||||
/* first one is stored in target' tid */
|
|
||||||
BlockIdData parentblk; /* parent block */
|
|
||||||
BlockIdData leftblk; /* prev left block */
|
|
||||||
BlockIdData rightblk; /* next right block */
|
|
||||||
uint16 leftlen; /* len of left page items below */
|
|
||||||
/* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */
|
|
||||||
} xl_btree_split;
|
|
||||||
|
|
||||||
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
|
||||||
|
|
||||||
/*
|
|
||||||
* New root log record.
|
|
||||||
*/
|
|
||||||
typedef struct xl_btree_newroot
|
|
||||||
{
|
|
||||||
RelFileNode node;
|
|
||||||
int32 level;
|
|
||||||
BlockIdData rootblk;
|
|
||||||
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
|
|
||||||
} xl_btree_newroot;
|
|
||||||
|
|
||||||
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData))
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define BTLessStrategyNumber 1
|
|
||||||
#define BTLessEqualStrategyNumber 2
|
|
||||||
#define BTEqualStrategyNumber 3
|
|
||||||
#define BTGreaterEqualStrategyNumber 4
|
|
||||||
#define BTGreaterStrategyNumber 5
|
|
||||||
#define BTMaxStrategyNumber 5
|
|
||||||
|
|
||||||
/*
|
|
||||||
* When a new operator class is declared, we require that the user
|
|
||||||
* supply us with an amproc procedure for determining whether, for
|
|
||||||
* two keys a and b, a < b, a = b, or a > b. This routine must
|
|
||||||
* return < 0, 0, > 0, respectively, in these three cases. Since we
|
|
||||||
* only have one such proc in amproc, it's number 1.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define BTORDER_PROC 1
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prototypes for functions in nbtree.c (external entry points for btree)
|
* prototypes for functions in nbtree.c (external entry points for btree)
|
||||||
*/
|
*/
|
||||||
|
@ -340,27 +427,26 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS);
|
||||||
extern Datum btrestrpos(PG_FUNCTION_ARGS);
|
extern Datum btrestrpos(PG_FUNCTION_ARGS);
|
||||||
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
|
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
|
||||||
|
|
||||||
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
|
|
||||||
extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
|
|
||||||
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prototypes for functions in nbtinsert.c
|
* prototypes for functions in nbtinsert.c
|
||||||
*/
|
*/
|
||||||
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
|
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
|
||||||
bool index_is_unique, Relation heapRel);
|
bool index_is_unique, Relation heapRel);
|
||||||
|
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
|
||||||
|
BTStack stack, bool is_root, bool is_only);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prototypes for functions in nbtpage.c
|
* prototypes for functions in nbtpage.c
|
||||||
*/
|
*/
|
||||||
extern void _bt_metapinit(Relation rel);
|
extern void _bt_metapinit(Relation rel);
|
||||||
extern Buffer _bt_getroot(Relation rel, int access);
|
extern Buffer _bt_getroot(Relation rel, int access);
|
||||||
|
extern Buffer _bt_gettrueroot(Relation rel);
|
||||||
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
|
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
|
||||||
extern void _bt_relbuf(Relation rel, Buffer buf);
|
extern void _bt_relbuf(Relation rel, Buffer buf);
|
||||||
extern void _bt_wrtbuf(Relation rel, Buffer buf);
|
extern void _bt_wrtbuf(Relation rel, Buffer buf);
|
||||||
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
|
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
|
||||||
extern void _bt_pageinit(Page page, Size size);
|
extern void _bt_pageinit(Page page, Size size);
|
||||||
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
|
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
|
||||||
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
|
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -377,6 +463,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
|
||||||
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
||||||
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
||||||
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
|
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
|
||||||
|
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prototypes for functions in nbtstrat.c
|
* prototypes for functions in nbtstrat.c
|
||||||
|
@ -407,4 +494,13 @@ extern void _bt_spooldestroy(BTSpool *btspool);
|
||||||
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
|
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
|
||||||
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
|
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* prototypes for functions in nbtxlog.c
|
||||||
|
*/
|
||||||
|
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
|
||||||
|
extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
|
||||||
|
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
|
||||||
|
extern void btree_xlog_startup(void);
|
||||||
|
extern void btree_xlog_cleanup(void);
|
||||||
|
|
||||||
#endif /* NBTREE_H */
|
#endif /* NBTREE_H */
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $Id: xlog.h,v 1.40 2002/11/15 02:44:57 momjian Exp $
|
* $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $
|
||||||
*/
|
*/
|
||||||
#ifndef XLOG_H
|
#ifndef XLOG_H
|
||||||
#define XLOG_H
|
#define XLOG_H
|
||||||
|
@ -145,10 +145,12 @@ typedef XLogPageHeaderData *XLogPageHeader;
|
||||||
*/
|
*/
|
||||||
typedef struct RmgrData
|
typedef struct RmgrData
|
||||||
{
|
{
|
||||||
char *rm_name;
|
const char *rm_name;
|
||||||
void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
|
void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr);
|
||||||
void (*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr);
|
void (*rm_undo) (XLogRecPtr lsn, XLogRecord *rptr);
|
||||||
void (*rm_desc) (char *buf, uint8 xl_info, char *rec);
|
void (*rm_desc) (char *buf, uint8 xl_info, char *rec);
|
||||||
|
void (*rm_startup) (void);
|
||||||
|
void (*rm_cleanup) (void);
|
||||||
} RmgrData;
|
} RmgrData;
|
||||||
|
|
||||||
extern RmgrData RmgrTable[];
|
extern RmgrData RmgrTable[];
|
||||||
|
|
|
@ -37,7 +37,7 @@
|
||||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $Id: catversion.h,v 1.177 2003/02/16 02:30:39 tgl Exp $
|
* $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -53,6 +53,6 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* yyyymmddN */
|
/* yyyymmddN */
|
||||||
#define CATALOG_VERSION_NO 200302151
|
#define CATALOG_VERSION_NO 200302171
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue