Improve checking of child pages in contrib/amcheck.

This commit eliminates lossiness in check for missing parent downlinks in
B-tree.  Instead of collecting lossy bitmap, we check for missing downlinks
while visiting child pages referenced by downlinks of target level.  We
traverse from previous child page to the subsequent child page by right links.
Intermediate pages are candidates to have lost parent downlinks.

Also this commit introduces matching of child high key to the pivot key of
it's parent.

Discussion: https://postgr.es/m/CAPpHfduoF-c4RhOyOm%3D4-Y367%2B8txq9Q6iM_ty0OYc8si1Abww%40mail.gmail.com
Author: Alexander Korotkov
Reviewed-by: Peter Geoghegan
This commit is contained in:
Alexander Korotkov 2020-03-11 12:00:31 +03:00
parent a88a285c7e
commit d114cc5387
2 changed files with 466 additions and 124 deletions

View File

@ -94,16 +94,26 @@ typedef struct BtreeCheckState
/* Target page's LSN */
XLogRecPtr targetlsn;
/*
* Low key: high key of left sibling of target page. Used only for child
* verification. So, 'lowkey' is kept only when 'readonly' is set.
*/
IndexTuple lowkey;
/*
* The rightlink and incomplete split flag of block one level down to the
* target page, which was visited last time via downlink from taget page.
* We use it to check for missing downlinks.
*/
BlockNumber prevrightlink;
bool previncompletesplit;
/*
* Mutable state, for optional heapallindexed verification:
*/
/* Bloom filter fingerprints B-Tree index */
bloom_filter *filter;
/* Bloom filter fingerprints downlink blocks within tree */
bloom_filter *downlinkfilter;
/* Right half of incomplete split marker */
bool rightsplit;
/* Debug counter */
int64 heaptuplespresent;
} BtreeCheckState;
@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock);
static void bt_downlink_missing_check(BtreeCheckState *state);
static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
OffsetNumber downlinkoffnum);
static void bt_child_highkey_check(BtreeCheckState *state,
OffsetNumber target_downlinkoffnum,
Page loaded_child,
uint32 target_level);
static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
BlockNumber targetblock, Page target);
static void bt_tuple_present_callback(Relation index, ItemPointer tid,
Datum *values, bool *isnull,
bool tupleIsAlive, void *checkstate);
@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
errmsg("index \"%s\" cannot be verified using transaction snapshot",
RelationGetRelationName(rel))));
}
else
{
/*
* Extra readonly downlink check.
*
* In readonly case, we know that there cannot be a concurrent
* page split or a concurrent page deletion, which gives us the
* opportunity to verify that every non-ignorable page had a
* downlink one level up. We must be tolerant of interrupted page
* splits and page deletions, though. This is taken care of in
* bt_downlink_missing_check().
*/
state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
}
}
Assert(!state->rootdescend || state->readonly);
@ -532,12 +533,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
current.istruerootlevel = true;
while (current.leftmost != P_NONE)
{
/*
* Leftmost page on level cannot be right half of incomplete split.
* This can go stale immediately in !readonly case.
*/
state->rightsplit = false;
/*
* Verify this level, and get left most page for next level down, if
* not at leaf level
@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
TableScanDesc scan;
/* Report on extra downlink checks performed in readonly case */
if (state->readonly)
{
ereport(DEBUG1,
(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
RelationGetRelationName(rel),
100.0 * bloom_prop_bits_set(state->downlinkfilter))));
bloom_free(state->downlinkfilter);
}
/*
* Create our own scan for table_index_build_scan(), rather than
* getting it to do so for us. This is required so that we can
@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
level.istruerootlevel ?
" (true root level)" : level.level == 0 ? " (leaf level)" : "");
state->prevrightlink = InvalidBlockNumber;
state->previncompletesplit = false;
do
{
/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
* mode, and since a page has no links within other pages
* (siblings and parent) once it is marked fully deleted, it
* should be impossible to land on a fully deleted page in
* readonly mode. See bt_downlink_check() for further details.
* readonly mode. See bt_child_check() for further details.
*
* The bt_downlink_check() P_ISDELETED() check is repeated here so
* The bt_child_check() P_ISDELETED() check is repeated here so
* that pages that are only reachable through sibling links get
* checked.
*/
@ -816,21 +804,57 @@ nextpage:
errmsg("circular link chain found in block %u of index \"%s\"",
current, RelationGetRelationName(state->rel))));
/*
* Record if page that is about to become target is the right half of
* an incomplete page split. This can go stale immediately in
* !readonly case.
*/
state->rightsplit = P_INCOMPLETE_SPLIT(opaque);
leftcurrent = current;
current = opaque->btpo_next;
if (state->lowkey)
{
Assert(state->readonly);
pfree(state->lowkey);
state->lowkey = NULL;
}
/*
* Copy current target high key as the low key of right sibling.
* Allocate memory in upper level context, so it would be cleared
* after reset of target context.
*
* We only need the low key in corner cases of checking child high
* keys. We use high key only when incomplete split on the child level
* falls to the boundary of pages on the target level. See
* bt_child_highkey_check() for details. So, typically we won't end
* up doing anything with low key, but it's simpler for general case
* high key verification to always have it available.
*
* The correctness of managing low key in the case of concurrent
* splits wasn't investigated yet. Thankfully we only need low key
* for readonly verification and concurrent splits won't happen.
*/
if (state->readonly && !P_RIGHTMOST(opaque))
{
IndexTuple itup;
ItemId itemid;
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, P_HIKEY);
itup = (IndexTuple) PageGetItem(state->target, itemid);
state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
memcpy(state->lowkey, itup, IndexTupleSize(itup));
}
/* Free page and associated memory for this iteration */
MemoryContextReset(state->targetcontext);
}
while (current != P_NONE);
if (state->lowkey)
{
Assert(state->readonly);
pfree(state->lowkey);
state->lowkey = NULL;
}
/* Don't change context for caller */
MemoryContextSwitchTo(oldcontext);
@ -863,7 +887,9 @@ nextpage:
* tuple.
*
* - That downlink to block was encountered in parent where that's expected.
* (Limited to heapallindexed readonly callers.)
* (Limited to readonly callers.)
*
* - That high keys of child pages matches corresponding pivot keys in parent.
*
* This is also where heapallindexed callers use their Bloom filter to
* fingerprint IndexTuples for later table_index_build_scan() verification.
@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) state->targetlsn)));
}
/* Fingerprint downlink blocks in heapallindexed + readonly case */
if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
{
BlockNumber childblock = BTreeTupleGetDownLink(itup);
bloom_add_element(state->downlinkfilter,
(unsigned char *) &childblock,
sizeof(BlockNumber));
}
/*
* Don't try to generate scankey using "negative infinity" item on
* internal pages. They are always truncated to zero attributes.
*/
if (offset_is_negative_infinity(topaque, offset))
{
/*
* We don't call bt_child_check() for "negative infinity" items.
* But if we're performing downlink connectivity check, we do it
* for every item including "negative infinity" one.
*/
if (!P_ISLEAF(topaque) && state->readonly)
{
bt_child_highkey_check(state,
offset,
NULL,
topaque->btpo.level);
}
continue;
}
/*
* Readonly callers may optionally verify that non-pivot tuples can
@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state)
* because it has no useful value to compare).
*/
if (!P_ISLEAF(topaque) && state->readonly)
{
BlockNumber childblock = BTreeTupleGetDownLink(itup);
bt_downlink_check(state, skey, childblock);
}
bt_child_check(state, skey, offset);
}
/*
* * Check if page has a downlink in parent *
* Special case bt_child_highkey_check() call
*
* This can only be checked in heapallindexed + readonly case.
* We don't pass an real downlink, but we've to finish the level
* processing. If condition is satisfied, we've already processed all the
* downlinks from the target level. But there still might be pages to the
* right of the child page pointer to by our rightmost downlink. And they
* might have missing downlinks. This final call checks for them.
*/
if (state->heapallindexed && state->readonly)
bt_downlink_missing_check(state);
if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
{
bt_child_highkey_check(state, InvalidOffsetNumber,
NULL, topaque->btpo.level);
}
}
/*
@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* Top level tree walk caller moves on to next page (makes it the new
* target) following recovery from this race. (cf. The rationale for
* child/downlink verification needing a ShareLock within
* bt_downlink_check(), where page deletion is also the main source of
* bt_child_check(), where page deletion is also the main source of
* trouble.)
*
* Note that it doesn't matter if right sibling page here is actually a
@ -1570,6 +1603,297 @@ bt_right_page_check_scankey(BtreeCheckState *state)
return bt_mkscankey_pivotsearch(state->rel, firstitup);
}
/*
* Check if two tuples are binary identical except the block number. So,
* this function is capable to compare pivot keys on different levels.
*/
static bool
bt_pivot_tuple_identical(IndexTuple itup1, IndexTuple itup2)
{
if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
return false;
if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
IndexTupleSize(itup1) - offsetof(ItemPointerData, ip_posid)) != 0)
return false;
return true;
}
/*---
* Check high keys on the child level. Traverse rightlinks from previous
* downlink to the current one. Check that there are no intermediate pages
* with missing downlinks.
*
* If 'loaded_child' is given, it's assumed to be the page pointed to by the
* downlink referenced by 'downlinkoffnum' of the target page.
*
* Basically this function is called for each target downlink and checks two
* invariants:
*
* 1) You can reach the next child from previous one via rightlinks;
* 2) Each child high key have matching pivot key on target level.
*
* Consider the sample tree picture.
*
* 1
* / \
* 2 <-> 3
* / \ / \
* 4 <> 5 <> 6 <> 7 <> 8
*
* This function will be called for blocks 4, 5, 6 and 8. Consider what is
* happening for each function call.
*
* - The function call for block 4 initializes data structure and matches high
* key of block 4 to downlink's pivot key of block 2.
* - The high key of block 5 is matched to the high key of block 2.
* - The block 6 has an incomplete split flag set, so its high key isn't
* matched to anything.
* - The function call for block 8 checks that block 8 can be found while
* following rightlinks from block 6. The high key of block 7 will be
* matched to downlink's pivot key in block 3.
*
* There is also final call of this function, which checks that there is no
* missing downlinks for children to the right of the child referenced by
* rightmost downlink in target level.
*/
static void
bt_child_highkey_check(BtreeCheckState *state,
OffsetNumber target_downlinkoffnum,
Page loaded_child,
uint32 target_level)
{
BlockNumber blkno = state->prevrightlink;
Page page;
BTPageOpaque opaque;
bool rightsplit = state->previncompletesplit;
bool first = true;
ItemId itemid;
IndexTuple itup;
BlockNumber downlink;
if (OffsetNumberIsValid(target_downlinkoffnum))
{
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, target_downlinkoffnum);
itup = (IndexTuple) PageGetItem(state->target, itemid);
downlink = BTreeTupleGetDownLink(itup);
}
else
{
downlink = P_NONE;
}
/*
* If no previous rightlink is memorized for current level just below
* target page's level, we are about to start from the leftmost page. We
* can't follow rightlinks from previous page, because there is no
* previous page. But we still can match high key.
*
* So we initialize variables for the loop above like there is previous
* page referencing current child. Also we imply previous page to not
* have incomplete split flag, that would make us require downlink for
* current child. That's correct, because leftmost page on the level
* should always have parent downlink.
*/
if (!BlockNumberIsValid(blkno))
{
blkno = downlink;
rightsplit = false;
}
/* Move to the right on the child level */
while (true)
{
/*
* Did we traverse the whole tree level and this is check for pages to
* the right of rightmost downlink?
*/
if (blkno == P_NONE && downlink == P_NONE)
{
state->prevrightlink = InvalidBlockNumber;
state->previncompletesplit = false;
return;
}
/* Did we traverse the whole tree level and don't find next downlink? */
if (blkno == P_NONE)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
state->prevrightlink, downlink,
RelationGetRelationName(state->rel))));
/* Load page contents */
if (blkno == downlink && loaded_child)
page = loaded_child;
else
page = palloc_btree_page(state, blkno);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* The first page we visit at the level should be leftmost */
if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
/* Check level for non-ignorable page */
if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, target_level - 1, opaque->btpo.level)));
/* Try to detect circular links */
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("circular link chain found in block %u of index \"%s\"",
blkno, RelationGetRelationName(state->rel))));
if (blkno != downlink && !P_IGNORE(opaque))
{
/* blkno probably has missing parent downlink */
bt_downlink_missing_check(state, rightsplit, blkno, page);
}
rightsplit = P_INCOMPLETE_SPLIT(opaque);
/*
* If we visit page with high key, check that it is be equal to the
* target key next to corresponding downlink.
*/
if (!rightsplit && !P_RIGHTMOST(opaque))
{
BTPageOpaque topaque;
IndexTuple highkey;
OffsetNumber pivotkey_offset;
/* Get high key */
itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
highkey = (IndexTuple) PageGetItem(page, itemid);
/*
* There might be two situations when we examine high key. If
* current child page is referenced by given target downlink, we
* should look to the next offset number for matching key from
* target page.
*
* Alternatively, we're following rightlinks somewhere in the
* middle between page referenced by previous target's downlink
* and the page referenced by current target's downlink. If
* current child page hasn't incomplete split flag set, then its
* high key should match to the target's key of current offset
* number. This happens when a previous call here (to
* bt_child_highkey_check()) found an incomplete split, and we
* reach a right sibling page without a downlink -- the right
* sibling page's high key still needs to be matched to a
* separator key on the parent/target level.
*
* Don't apply OffsetNumberNext() to target_downlinkoffnum when we
* already had to step right on the child level. Our traversal of
* the child level must try to move in perfect lockstep behind (to
* the left of) the target/parent level traversal.
*/
if (blkno == downlink)
pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
else
pivotkey_offset = target_downlinkoffnum;
topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
if (!offset_is_negative_infinity(topaque, pivotkey_offset))
{
/*
* If we're looking for the next pivot tuple in target page,
* but there is no more pivot tuples, then we should match to
* high key instead.
*/
if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
{
if (P_RIGHTMOST(topaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
pivotkey_offset = P_HIKEY;
}
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, pivotkey_offset);
itup = (IndexTuple) PageGetItem(state->target, itemid);
}
else
{
/*
* We cannot try to match child's high key to a negative
* infinity key in target, since there is nothing to compare.
* However, it's still possible to match child's high key
* outside of target page. The reason why we're are is that
* bt_child_highkey_check() was previously called for the
* cousin page of 'loaded_child', which is incomplete split.
* So, now we traverse to the right of that cousin page and
* current child level page under consideration still belongs
* to the subtree of target's left sibling. Thus, we need to
* match child's high key to it's left uncle page high key.
* Thankfully we saved it, it's called a "low key" of target
* page.
*/
if (!state->lowkey)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("can't find left sibling high key in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
itup = state->lowkey;
}
if (!bt_pivot_tuple_identical(highkey, itup))
{
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("mismatch between parent key and child high key in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
}
/* Exit if we already found next downlink */
if (blkno == downlink)
{
state->prevrightlink = opaque->btpo_next;
state->previncompletesplit = rightsplit;
return;
}
/* Traverse to the next page using rightlink */
blkno = opaque->btpo_next;
/* Free page contents if it's allocated by us */
if (page != loaded_child)
pfree(page);
first = false;
}
}
/*
* Checks one of target's downlink against its child page.
*
@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* The downlink insertion into the target is probably where any problem raised
* here arises, and there is no such thing as a parent link, so doing the
* verification this way around is much more practical.
*
* This function visits child page and it's sequentially called for each
* downlink of target page. Assuming this we also check downlink connectivity
* here in order to save child page visits.
*/
static void
bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock)
bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
OffsetNumber downlinkoffnum)
{
ItemId itemid;
IndexTuple itup;
BlockNumber childblock;
OffsetNumber offset;
OffsetNumber maxoffset;
Page child;
BTPageOpaque copaque;
BTPageOpaque topaque;
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, downlinkoffnum);
itup = (IndexTuple) PageGetItem(state->target, itemid);
childblock = BTreeTupleGetDownLink(itup);
/*
* Caller must have ShareLock on target relation, because of
@ -1636,10 +1973,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* Check all items, rather than checking just the first and trusting that
* the operator class obeys the transitive law.
*/
topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
child = palloc_btree_page(state, childblock);
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
maxoffset = PageGetMaxOffsetNumber(child);
/*
* Since we've already loaded the child block, combine this check with
* check for downlink connectivity.
*/
bt_child_highkey_check(state, downlinkoffnum,
child, topaque->btpo.level);
/*
* Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent)
@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* be concerned about concurrent page splits or page deletions.
*/
static void
bt_downlink_missing_check(BtreeCheckState *state)
bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
BlockNumber blkno, Page page)
{
BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
ItemId itemid;
IndexTuple itup;
Page child;
BTPageOpaque copaque;
uint32 level;
BlockNumber childblk;
XLogRecPtr pagelsn;
Assert(state->heapallindexed && state->readonly);
Assert(!P_IGNORE(topaque));
Assert(state->readonly);
Assert(!P_IGNORE(opaque));
/* No next level up with downlinks to fingerprint from the true root */
if (P_ISROOT(topaque))
if (P_ISROOT(opaque))
return;
pagelsn = PageGetLSN(page);
/*
* Incomplete (interrupted) page splits can account for the lack of a
* downlink. Some inserting transaction should eventually complete the
@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state)
* by design, so it shouldn't be taken to indicate corruption. See
* _bt_pagedel() for full details.
*/
if (state->rightsplit)
if (rightsplit)
{
ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA),
errmsg("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
state->targetblock, topaque->btpo.level,
topaque->btpo_prev,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
blkno, opaque->btpo.level,
opaque->btpo_prev,
(uint32) (pagelsn >> 32),
(uint32) pagelsn)));
return;
}
/* Target's downlink is typically present in parent/fingerprinted */
if (!bloom_lacks_element(state->downlinkfilter,
(unsigned char *) &state->targetblock,
sizeof(BlockNumber)))
return;
/*
* Target is probably the "top parent" of a multi-level page deletion.
* We'll need to descend the subtree to make sure that descendant pages
* are consistent with that, though.
* Page under check is probably the "top parent" of a multi-level page
* deletion. We'll need to descend the subtree to make sure that
* descendant pages are consistent with that, though.
*
* If the target page (which must be non-ignorable) is a leaf page, then
* clearly it can't be the top parent. The lack of a downlink is probably
* a symptom of a broad problem that could just as easily cause
* If the page (which must be non-ignorable) is a leaf page, then clearly
* it can't be the top parent. The lack of a downlink is probably a
* symptom of a broad problem that could just as easily cause
* inconsistencies anywhere else.
*/
if (P_ISLEAF(topaque))
if (P_ISLEAF(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leaf index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u page lsn=%X/%X.",
state->targetblock,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
blkno,
(uint32) (pagelsn >> 32),
(uint32) pagelsn)));
/* Descend from the target page, which is an internal page */
/* Descend from the given page, which is an internal page */
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel));
level = topaque->btpo.level;
itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
P_FIRSTDATAKEY(topaque));
itup = (IndexTuple) PageGetItem(state->target, itemid);
level = opaque->btpo.level;
itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(page, itemid);
childblk = BTreeTupleGetDownLink(itup);
for (;;)
{
@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
state->targetblock, childblk,
errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, childblk,
level - 1, copaque->btpo.level)));
level = copaque->btpo.level;
@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state)
* Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent)
* once it is marked fully deleted, it should be impossible to land on a
* fully deleted page. See bt_downlink_check() for further details.
* fully deleted page. See bt_child_check() for further details.
*
* The bt_downlink_check() P_ISDELETED() check is repeated here because
* bt_downlink_check() does not visit pages reachable through negative
* infinity items. Besides, bt_downlink_check() is unwilling to descend
* multiple levels. (The similar bt_downlink_check() P_ISDELETED() check
* The bt_child_check() P_ISDELETED() check is repeated here because
* bt_child_check() does not visit pages reachable through negative
* infinity items. Besides, bt_child_check() is unwilling to descend
* multiple levels. (The similar bt_child_check() P_ISDELETED() check
* within bt_check_level_from_leftmost() won't reach the page either,
* since the leaf's live siblings should have their sibling links updated
* to bypass the deletion target page when it is marked fully dead.)
@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
state->targetblock, childblk,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.",
blkno, childblk,
(uint32) (pagelsn >> 32),
(uint32) pagelsn)));
/*
* Iff leaf page is half-dead, its high key top parent link should point
* to what VACUUM considered to be the top parent page at the instant it
* was interrupted. Provided the high key link actually points to the
* target page, the missing downlink we detected is consistent with there
* having been an interrupted multi-level page deletion. This means that
* the subtree with the target page at its root (a page deletion chain) is
* in a consistent state, enabling VACUUM to resume deleting the entire
* chain the next time it encounters the half-dead leaf page.
* page under check, the missing downlink we detected is consistent with
* there having been an interrupted multi-level page deletion. This means
* that the subtree with the page under check at its root (a page deletion
* chain) is in a consistent state, enabling VACUUM to resume deleting the
* entire chain the next time it encounters the half-dead leaf page.
*/
if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
{
itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
itup = (IndexTuple) PageGetItem(child, itemid);
if (BTreeTupleGetTopParent(itup) == state->targetblock)
if (BTreeTupleGetTopParent(itup) == blkno)
return;
}
@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
state->targetblock, topaque->btpo.level,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
blkno, opaque->btpo.level,
(uint32) (pagelsn >> 32),
(uint32) pagelsn)));
}
/*
@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n)
* separator key value won't always be available from parent, though, because
* the first items of internal pages are negative infinity items, truncated
* down to zero attributes during internal page splits. While it's true that
* bt_downlink_check() and the high key check can detect most imaginable key
* bt_child_check() and the high key check can detect most imaginable key
* space problems, there are remaining problems it won't detect with non-pivot
* tuples in cousin leaf pages. Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the

View File

@ -125,8 +125,7 @@ ORDER BY c.relpages DESC LIMIT 10;
Optionally, when the <parameter>heapallindexed</parameter>
argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the
index, and that there are no missing downlinks in the index
structure. When the optional <parameter>rootdescend</parameter>
index. When the optional <parameter>rootdescend</parameter>
argument is <literal>true</literal>, verification re-finds
tuples on the leaf level by performing a new search from the
root page for each tuple. The checks that can be performed by
@ -136,7 +135,8 @@ ORDER BY c.relpages DESC LIMIT 10;
a more thorough variant of <function>bt_index_check</function>:
unlike <function>bt_index_check</function>,
<function>bt_index_parent_check</function> also checks
invariants that span parent/child relationships.
invariants that span parent/child relationships, including checking
that there are no missing downlinks in the index structure.
<function>bt_index_parent_check</function> follows the general
convention of raising an error if it finds a logical
inconsistency or other problem.