From 747cef5a58ca9122aca4e7a901d1f22ef70d94bc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Sep 2023 14:14:49 +0300 Subject: [PATCH] Fix another bug in parent page splitting during GiST index build. Yet another bug in the ilk of commits a7ee7c851 and 741b88435. In 741b88435, we took care to clear the memorized location of the downlink when we split the parent page, because splitting the parent page can move the downlink. But we missed that even *updating* a tuple on the parent can move it, because updating a tuple on a gist page is implemented as a delete+insert, so the updated tuple gets moved to the end of the page. This commit fixes the bug in two different ways (belt and suspenders): 1. Clear the downlink when we update a tuple on the parent page, even if it's not split. This the same approach as in commits a7ee7c851 and 741b88435. I also noticed that gistFindCorrectParent did not clear the 'downlinkoffnum' when it stepped to the right sibling. Fix that too, as it seems like a clear bug even though I haven't been able to find a test case to hit that. 2. Change gistFindCorrectParent so that it treats 'downlinkoffnum' merely as a hint. It now always first checks if the downlink is still at that location, and if not, it scans the page like before. That's more robust if there are still more cases where we fail to clear 'downlinkoffnum' that we haven't yet uncovered. With this, it's no longer necessary to meticulously clear 'downlinkoffnum', so this makes the previous fixes unnecessary, but I didn't revert them because it still seems nice to clear it when we know that the downlink has moved. Also add the test case using the same test data that Alexander posted. I tried to reduce it to a smaller test, and I also tried to reproduce this with different test data, but I was not able to, so let's just include what we have. Backpatch to v12, like the previous fixes. Reported-by: Alexander Lakhin Discussion: https://www.postgresql.org/message-id/18129-caca016eaf0c3702@postgresql.org --- contrib/intarray/expected/_int.out | 91 +++++++++++++ contrib/intarray/sql/_int.sql | 35 +++++ src/backend/access/gist/gist.c | 204 ++++++++++++++++------------- 3 files changed, 238 insertions(+), 92 deletions(-) diff --git a/contrib/intarray/expected/_int.out b/contrib/intarray/expected/_int.out index 7dd28ea145..2f33c7e18d 100644 --- a/contrib/intarray/expected/_int.out +++ b/contrib/intarray/expected/_int.out @@ -857,4 +857,95 @@ SELECT count(*) from test__int WHERE a @@ '!20 & !21'; 6343 (1 row) +DROP INDEX text_idx; +-- Repeat the same queries with an extended data set. The data set is the +-- same that we used before, except that each element in the array is +-- repeated three times, offset by 1000 and 2000. For example, {1, 5} +-- becomes {1, 1001, 2001, 5, 1005, 2005}. +-- +-- That has proven to be unreasonably effective at exercising codepaths in +-- core GiST code related to splitting parent pages, which is not covered by +-- other tests. This is a bit out-of-place as the point is to test core GiST +-- code rather than this extension, but there is no suitable GiST opclass in +-- core that would reach the same codepaths. +CREATE TABLE more__int AS SELECT + -- Leave alone NULLs, empty arrays and the one row that we use to test + -- equality + CASE WHEN a IS NULL OR a = '{}' OR a = '{73,23,20}' THEN a ELSE + (select array_agg(u) || array_agg(u + 1000) || array_agg(u + 2000) from (select unnest(a) u) x) + END AS a, a as b + FROM test__int; +CREATE INDEX ON more__int using gist (a gist__int_ops(numranges = 252)); +SELECT count(*) from more__int WHERE a && '{23,50}'; + count +------- + 403 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '23|50'; + count +------- + 403 +(1 row) + +SELECT count(*) from more__int WHERE a @> '{23,50}'; + count +------- + 12 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '23&50'; + count +------- + 12 +(1 row) + +SELECT count(*) from more__int WHERE a @> '{20,23}'; + count +------- + 12 +(1 row) + +SELECT count(*) from more__int WHERE a <@ '{73,23,20}'; + count +------- + 10 +(1 row) + +SELECT count(*) from more__int WHERE a = '{73,23,20}'; + count +------- + 1 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '50&68'; + count +------- + 9 +(1 row) + +SELECT count(*) from more__int WHERE a @> '{20,23}' or a @> '{50,68}'; + count +------- + 21 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '(20&23)|(50&68)'; + count +------- + 21 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '20 | !21'; + count +------- + 6566 +(1 row) + +SELECT count(*) from more__int WHERE a @@ '!20 & !21'; + count +------- + 6343 +(1 row) + RESET enable_seqscan; diff --git a/contrib/intarray/sql/_int.sql b/contrib/intarray/sql/_int.sql index f8002c93fc..bd3e01208d 100644 --- a/contrib/intarray/sql/_int.sql +++ b/contrib/intarray/sql/_int.sql @@ -180,4 +180,39 @@ SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)'; SELECT count(*) from test__int WHERE a @@ '20 | !21'; SELECT count(*) from test__int WHERE a @@ '!20 & !21'; +DROP INDEX text_idx; + +-- Repeat the same queries with an extended data set. The data set is the +-- same that we used before, except that each element in the array is +-- repeated three times, offset by 1000 and 2000. For example, {1, 5} +-- becomes {1, 1001, 2001, 5, 1005, 2005}. +-- +-- That has proven to be unreasonably effective at exercising codepaths in +-- core GiST code related to splitting parent pages, which is not covered by +-- other tests. This is a bit out-of-place as the point is to test core GiST +-- code rather than this extension, but there is no suitable GiST opclass in +-- core that would reach the same codepaths. +CREATE TABLE more__int AS SELECT + -- Leave alone NULLs, empty arrays and the one row that we use to test + -- equality + CASE WHEN a IS NULL OR a = '{}' OR a = '{73,23,20}' THEN a ELSE + (select array_agg(u) || array_agg(u + 1000) || array_agg(u + 2000) from (select unnest(a) u) x) + END AS a, a as b + FROM test__int; +CREATE INDEX ON more__int using gist (a gist__int_ops(numranges = 252)); + +SELECT count(*) from more__int WHERE a && '{23,50}'; +SELECT count(*) from more__int WHERE a @@ '23|50'; +SELECT count(*) from more__int WHERE a @> '{23,50}'; +SELECT count(*) from more__int WHERE a @@ '23&50'; +SELECT count(*) from more__int WHERE a @> '{20,23}'; +SELECT count(*) from more__int WHERE a <@ '{73,23,20}'; +SELECT count(*) from more__int WHERE a = '{73,23,20}'; +SELECT count(*) from more__int WHERE a @@ '50&68'; +SELECT count(*) from more__int WHERE a @> '{20,23}' or a @> '{50,68}'; +SELECT count(*) from more__int WHERE a @@ '(20&23)|(50&68)'; +SELECT count(*) from more__int WHERE a @@ '20 | !21'; +SELECT count(*) from more__int WHERE a @@ '!20 & !21'; + + RESET enable_seqscan; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 0683f42c25..3c519f743f 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -1017,87 +1017,106 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) * remain so at exit, but it might not be the same page anymore. */ static void -gistFindCorrectParent(Relation r, GISTInsertStack *child) +gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build) { GISTInsertStack *parent = child->parent; + ItemId iid; + IndexTuple idxtuple; + OffsetNumber maxoff; + GISTInsertStack *ptr; gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); + maxoff = PageGetMaxOffsetNumber(parent->page); - /* here we don't need to distinguish between split and page update */ - if (child->downlinkoffnum == InvalidOffsetNumber || - parent->lsn != PageGetLSN(parent->page)) + /* Check if the downlink is still where it was before */ + if (child->downlinkoffnum != InvalidOffsetNumber && child->downlinkoffnum <= maxoff) { - /* parent is changed, look child in right links until found */ - OffsetNumber i, - maxoff; - ItemId iid; - IndexTuple idxtuple; - GISTInsertStack *ptr; - - while (true) - { - maxoff = PageGetMaxOffsetNumber(parent->page); - for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) - { - iid = PageGetItemId(parent->page, i); - idxtuple = (IndexTuple) PageGetItem(parent->page, iid); - if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) - { - /* yes!!, found */ - child->downlinkoffnum = i; - return; - } - } - - parent->blkno = GistPageGetOpaque(parent->page)->rightlink; - UnlockReleaseBuffer(parent->buffer); - if (parent->blkno == InvalidBlockNumber) - { - /* - * End of chain and still didn't find parent. It's a very-very - * rare situation when root splitted. - */ - break; - } - parent->buffer = ReadBuffer(r, parent->blkno); - LockBuffer(parent->buffer, GIST_EXCLUSIVE); - gistcheckpage(r, parent->buffer); - parent->page = (Page) BufferGetPage(parent->buffer); - } - - /* - * awful!!, we need search tree to find parent ... , but before we - * should release all old parent - */ - - ptr = child->parent->parent; /* child->parent already released - * above */ - while (ptr) - { - ReleaseBuffer(ptr->buffer); - ptr = ptr->parent; - } - - /* ok, find new path */ - ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); - - /* read all buffers as expected by caller */ - /* note we don't lock them or gistcheckpage them here! */ - while (ptr) - { - ptr->buffer = ReadBuffer(r, ptr->blkno); - ptr->page = (Page) BufferGetPage(ptr->buffer); - ptr = ptr->parent; - } - - /* install new chain of parents to stack */ - child->parent = parent; - - /* make recursive call to normal processing */ - LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); - gistFindCorrectParent(r, child); + iid = PageGetItemId(parent->page, child->downlinkoffnum); + idxtuple = (IndexTuple) PageGetItem(parent->page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + return; /* still there */ } + + /* + * The page has changed since we looked. During normal operation, every + * update of a page changes its LSN, so the LSN we memorized should have + * changed too. During index build, however, we don't WAL-log the changes + * until we have built the index, so the LSN doesn't change. There is no + * concurrent activity during index build, but we might have changed the + * parent ourselves. + */ + Assert(parent->lsn != PageGetLSN(parent->page) || is_build); + + /* + * Scan the page to re-find the downlink. If the page was split, it might + * have moved to a different page, so follow the right links until we find + * it. + */ + while (true) + { + OffsetNumber i; + + maxoff = PageGetMaxOffsetNumber(parent->page); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(parent->page, i); + idxtuple = (IndexTuple) PageGetItem(parent->page, iid); + if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) + { + /* yes!!, found */ + child->downlinkoffnum = i; + return; + } + } + + parent->blkno = GistPageGetOpaque(parent->page)->rightlink; + parent->downlinkoffnum = InvalidOffsetNumber; + UnlockReleaseBuffer(parent->buffer); + if (parent->blkno == InvalidBlockNumber) + { + /* + * End of chain and still didn't find parent. It's a very-very + * rare situation when root splitted. + */ + break; + } + parent->buffer = ReadBuffer(r, parent->blkno); + LockBuffer(parent->buffer, GIST_EXCLUSIVE); + gistcheckpage(r, parent->buffer); + parent->page = (Page) BufferGetPage(parent->buffer); + } + + /* + * awful!!, we need search tree to find parent ... , but before we should + * release all old parent + */ + + ptr = child->parent->parent; /* child->parent already released above */ + while (ptr) + { + ReleaseBuffer(ptr->buffer); + ptr = ptr->parent; + } + + /* ok, find new path */ + ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); + + /* read all buffers as expected by caller */ + /* note we don't lock them or gistcheckpage them here! */ + while (ptr) + { + ptr->buffer = ReadBuffer(r, ptr->blkno); + ptr->page = (Page) BufferGetPage(ptr->buffer); + ptr = ptr->parent; + } + + /* install new chain of parents to stack */ + child->parent = parent; + + /* make recursive call to normal processing */ + LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); + gistFindCorrectParent(r, child, is_build); } /* @@ -1105,7 +1124,7 @@ gistFindCorrectParent(Relation r, GISTInsertStack *child) */ static IndexTuple gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, - GISTInsertStack *stack) + GISTInsertStack *stack, bool is_build) { Page page = BufferGetPage(buf); OffsetNumber maxoff; @@ -1146,7 +1165,7 @@ gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, ItemId iid; LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE); - gistFindCorrectParent(rel, stack); + gistFindCorrectParent(rel, stack, is_build); iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum); downlink = (IndexTuple) PageGetItem(stack->parent->page, iid); downlink = CopyIndexTuple(downlink); @@ -1192,7 +1211,7 @@ gistfixsplit(GISTInsertState *state, GISTSTATE *giststate) page = BufferGetPage(buf); /* Form the new downlink tuples to insert to parent */ - downlink = gistformdownlink(state->r, buf, giststate, stack); + downlink = gistformdownlink(state->r, buf, giststate, stack, state->is_build); si->buf = buf; si->downlink = downlink; @@ -1346,7 +1365,7 @@ gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, right = (GISTPageSplitInfo *) list_nth(splitinfo, pos); left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1); - gistFindCorrectParent(state->r, stack); + gistFindCorrectParent(state->r, stack, state->is_build); if (gistinserttuples(state, stack->parent, giststate, &right->downlink, 1, InvalidOffsetNumber, @@ -1371,21 +1390,22 @@ gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, */ tuples[0] = left->downlink; tuples[1] = right->downlink; - gistFindCorrectParent(state->r, stack); - if (gistinserttuples(state, stack->parent, giststate, - tuples, 2, - stack->downlinkoffnum, - left->buf, right->buf, - true, /* Unlock parent */ - unlockbuf /* Unlock stack->buffer if caller wants - * that */ - )) - { - /* - * If the parent page was split, the downlink might have moved. - */ - stack->downlinkoffnum = InvalidOffsetNumber; - } + gistFindCorrectParent(state->r, stack, state->is_build); + (void) gistinserttuples(state, stack->parent, giststate, + tuples, 2, + stack->downlinkoffnum, + left->buf, right->buf, + true, /* Unlock parent */ + unlockbuf /* Unlock stack->buffer if caller + * wants that */ + ); + + /* + * The downlink might have moved when we updated it. Even if the page + * wasn't split, because gistinserttuples() implements updating the old + * tuple by removing and re-inserting it! + */ + stack->downlinkoffnum = InvalidOffsetNumber; Assert(left->buf == stack->buffer);