diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 39e2c07ad3..86f938686c 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -19,6 +19,7 @@ #include "access/xloginsert.h" #include "miscadmin.h" #include "storage/predicate.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -28,6 +29,8 @@ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, Buffer childbuf, GinStatsData *buildStats); static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, GinStatsData *buildStats); +static void ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack, + GinStatsData *buildStats, int access); /* * Lock buffer by needed method for search. @@ -108,7 +111,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, * encounter on the way. */ if (!searchMode && GinPageIsIncompleteSplit(page)) - ginFinishSplit(btree, stack, false, NULL); + ginFinishOldSplit(btree, stack, NULL, access); /* * ok, page is correctly locked, we should check to move right .., @@ -128,7 +131,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode, page = BufferGetPage(stack->buffer); if (!searchMode && GinPageIsIncompleteSplit(page)) - ginFinishSplit(btree, stack, false, NULL); + ginFinishOldSplit(btree, stack, NULL, access); } if (GinPageIsLeaf(page)) /* we found, return locked page */ @@ -164,8 +167,11 @@ ginFindLeafPage(GinBtree btree, bool searchMode, * Step right from current page. * * The next page is locked first, before releasing the current page. This is - * crucial to protect from concurrent page deletion (see comment in - * ginDeletePage). + * crucial to prevent concurrent VACUUM from deleting a page that we are about + * to step to. (The lock-coupling isn't strictly necessary when we are + * traversing the tree to find an insert location, because page deletion grabs + * a cleanup lock on the root to prevent any concurrent inserts. See Page + * deletion section in the README. But there's no harm in doing it always.) */ Buffer ginStepRight(Buffer buffer, Relation index, int lockmode) @@ -262,7 +268,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) ptr->parent = root; ptr->off = InvalidOffsetNumber; - ginFinishSplit(btree, ptr, false, NULL); + ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE); } leftmostBlkno = btree->getLeftMostChild(btree, page); @@ -291,7 +297,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) ptr->parent = root; ptr->off = InvalidOffsetNumber; - ginFinishSplit(btree, ptr, false, NULL); + ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE); } } @@ -670,15 +676,6 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, bool done; bool first = true; - /* - * freestack == false when we encounter an incompletely split page during - * a scan, while freestack == true is used in the normal scenario that a - * split is finished right after the initial insert. - */ - if (!freestack) - elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", - stack->blkno, RelationGetRelationName(btree->index)); - /* this loop crawls up the stack until the insertion is complete */ do { @@ -686,6 +683,13 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, void *insertdata; BlockNumber updateblkno; +#ifdef USE_INJECTION_POINTS + if (GinPageIsLeaf(BufferGetPage(stack->buffer))) + INJECTION_POINT("gin-leave-leaf-split-incomplete"); + else + INJECTION_POINT("gin-leave-internal-split-incomplete"); +#endif + /* search parent to lock */ LockBuffer(parent->buffer, GIN_EXCLUSIVE); @@ -699,7 +703,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, * would fail. */ if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) - ginFinishSplit(btree, parent, false, buildStats); + ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE); /* move right if it's needed */ page = BufferGetPage(parent->buffer); @@ -723,7 +727,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, page = BufferGetPage(parent->buffer); if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer))) - ginFinishSplit(btree, parent, false, buildStats); + ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE); } /* insert the downlink */ @@ -759,6 +763,43 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, freeGinBtreeStack(stack); } +/* + * An entry point to ginFinishSplit() that is used when we stumble upon an + * existing incompletely split page in the tree, as opposed to completing a + * split that we just made outselves. The difference is that stack->buffer may + * be merely share-locked on entry, and will be upgraded to exclusive mode. + * + * Note: Upgrading the lock momentarily releases it. Doing that in a scan + * would not be OK, because a concurrent VACUUM might delete the page while + * we're not holding the lock. It's OK in an insert, though, because VACUUM + * has a different mechanism that prevents it from running concurrently with + * inserts. (Namely, it holds a cleanup lock on the root.) + */ +static void +ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats, int access) +{ + INJECTION_POINT("gin-finish-incomplete-split"); + elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"", + stack->blkno, RelationGetRelationName(btree->index)); + + if (access == GIN_SHARE) + { + LockBuffer(stack->buffer, GIN_UNLOCK); + LockBuffer(stack->buffer, GIN_EXCLUSIVE); + + if (!GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) + { + /* + * Someone else already completed the split while we were not + * holding the lock. + */ + return; + } + } + + ginFinishSplit(btree, stack, false, buildStats); +} + /* * Insert a value to tree described by stack. * @@ -779,7 +820,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata, /* If the leaf page was incompletely split, finish the split first */ if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer))) - ginFinishSplit(btree, stack, false, buildStats); + ginFinishOldSplit(btree, stack, buildStats, GIN_EXCLUSIVE); done = ginPlaceToPage(btree, stack, insertdata, InvalidBlockNumber, diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index e32c8925f6..89aa41b5e3 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -40,9 +40,9 @@ SUBDIRS = \ ifeq ($(enable_injection_points),yes) -SUBDIRS += injection_points +SUBDIRS += injection_points gin else -ALWAYS_SUBDIRS += injection_points +ALWAYS_SUBDIRS += injection_points gin endif ifeq ($(with_ssl),openssl) diff --git a/src/test/modules/gin/Makefile b/src/test/modules/gin/Makefile new file mode 100644 index 0000000000..a7731b7fa1 --- /dev/null +++ b/src/test/modules/gin/Makefile @@ -0,0 +1,16 @@ +# src/test/modules/gin/Makefile + +EXTRA_INSTALL = src/test/modules/injection_points + +REGRESS = gin_incomplete_splits + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/gin +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/gin/expected/gin_incomplete_splits.out b/src/test/modules/gin/expected/gin_incomplete_splits.out new file mode 100644 index 0000000000..9a60f8c128 --- /dev/null +++ b/src/test/modules/gin/expected/gin_incomplete_splits.out @@ -0,0 +1,180 @@ +-- +-- The test uses a GIN index over int[]. The table contains arrays +-- with integers from 1 to :next_i. Each integer occurs exactly once, +-- no gaps or duplicates, although the index does contain some +-- duplicate elements because some of the inserting transactions are +-- rolled back during the test. The exact contents of the table depend +-- on the physical layout of the index, which in turn depends at least +-- on the block size, so instead of check for the exact contents, we +-- check those invariants. :next_i psql variable is maintained at all +-- times to hold the last inserted integer + 1. +-- +-- This uses injection points to cause errors that leave some page +-- splits in "incomplete" state +create extension injection_points; +-- Use the index for all the queries +set enable_seqscan=off; +-- Print a NOTICE whenever an incomplete split gets fixed +SELECT injection_points_attach('gin-finish-incomplete-split', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +-- +-- First create the test table and some helper functions +-- +create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off); +create index on gin_incomplete_splits using gin (i) with (fastupdate = off); +-- Creates an array with all integers from $1 (inclusive) $2 (exclusive) +create function range_array(int, int) returns int[] language sql immutable as $$ + select array_agg(g) from generate_series($1, $2 - 1) g +$$; +-- Inserts an array with 'n' rows to the test table. Pass :next_i as +-- the first argument, returns the new value for :next_i. +create function insert_n(first_i int, n int) returns int language plpgsql as $$ +begin + insert into gin_incomplete_splits values (range_array(first_i, first_i+n)); + return first_i + n; +end; +$$; +-- Inserts to the table until an insert fails. Like insert_n(), returns the +-- new value for :next_i. +create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$ +declare + i integer; +begin + -- Insert arrays with 'step' elements each, until an error occurs. + loop + begin + select insert_n(next_i, step) into next_i; + exception when others then + raise notice 'failed with: %', sqlerrm; + exit; + end; + + -- The caller is expected to set an injection point that eventuall + -- causes an error. But bail out if still no error after 10000 + -- attempts, so that we don't get stuck in an infinite loop. + i := i + 1; + if i = 10000 then + raise 'no error on inserts after '; + end if; + end loop; + + return next_i; +end; +$$; +-- Check the invariants. +create function verify(next_i int) returns bool language plpgsql as $$ +declare + a integer[]; + elem integer; + c integer; +begin + -- Perform a scan over the trailing part of the index, where the + -- possible incomplete splits are. (We don't check the whole table, + -- because that'd be pretty slow.) + c := 0; + -- Find all arrays that overlap with the last 200 inserted integers. Or + -- the next 100, which shouldn't exist. + for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100) + loop + -- count all elements in those arrays in the window. + foreach elem in ARRAY a loop + if elem >= next_i then + raise 'unexpected value % in array', elem; + end if; + if elem >= next_i - 200 then + c := c + 1; + end if; + end loop; + end loop; + if c <> 200 then + raise 'unexpected count % ', c; + end if; + + return true; +end; +$$; +-- Insert one array to get started. +select insert_n(1, 1000) as next_i +\gset +select verify(:next_i); + verify +-------- + t +(1 row) + +-- +-- Test incomplete leaf split +-- +SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error'); + injection_points_attach +------------------------- + +(1 row) + +select insert_until_fail(:next_i) as next_i +\gset +NOTICE: failed with: error triggered for injection point gin-leave-leaf-split-incomplete +SELECT injection_points_detach('gin-leave-leaf-split-incomplete'); + injection_points_detach +------------------------- + +(1 row) + +-- Verify that a scan works even though there's an incomplete split +select verify(:next_i); + verify +-------- + t +(1 row) + +-- Insert some more rows, finishing the split +select insert_n(:next_i, 10) as next_i +\gset +NOTICE: notice triggered for injection point gin-finish-incomplete-split +-- Verify that a scan still works +select verify(:next_i); + verify +-------- + t +(1 row) + +-- +-- Test incomplete internal page split +-- +SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error'); + injection_points_attach +------------------------- + +(1 row) + +select insert_until_fail(:next_i, 100) as next_i +\gset +NOTICE: failed with: error triggered for injection point gin-leave-internal-split-incomplete +SELECT injection_points_detach('gin-leave-internal-split-incomplete'); + injection_points_detach +------------------------- + +(1 row) + + -- Verify that a scan works even though there's an incomplete split +select verify(:next_i); + verify +-------- + t +(1 row) + +-- Insert some more rows, finishing the split +select insert_n(:next_i, 10) as next_i +\gset +NOTICE: notice triggered for injection point gin-finish-incomplete-split +-- Verify that a scan still works +select verify(:next_i); + verify +-------- + t +(1 row) + diff --git a/src/test/modules/gin/meson.build b/src/test/modules/gin/meson.build new file mode 100644 index 0000000000..9734b51de2 --- /dev/null +++ b/src/test/modules/gin/meson.build @@ -0,0 +1,16 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +if not get_option('injection_points') + subdir_done() +endif + +tests += { + 'name': 'gin', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'gin_incomplete_splits', + ], + }, +} diff --git a/src/test/modules/gin/sql/gin_incomplete_splits.sql b/src/test/modules/gin/sql/gin_incomplete_splits.sql new file mode 100644 index 0000000000..4e180b2519 --- /dev/null +++ b/src/test/modules/gin/sql/gin_incomplete_splits.sql @@ -0,0 +1,144 @@ +-- +-- The test uses a GIN index over int[]. The table contains arrays +-- with integers from 1 to :next_i. Each integer occurs exactly once, +-- no gaps or duplicates, although the index does contain some +-- duplicate elements because some of the inserting transactions are +-- rolled back during the test. The exact contents of the table depend +-- on the physical layout of the index, which in turn depends at least +-- on the block size, so instead of check for the exact contents, we +-- check those invariants. :next_i psql variable is maintained at all +-- times to hold the last inserted integer + 1. +-- + +-- This uses injection points to cause errors that leave some page +-- splits in "incomplete" state +create extension injection_points; + +-- Use the index for all the queries +set enable_seqscan=off; + +-- Print a NOTICE whenever an incomplete split gets fixed +SELECT injection_points_attach('gin-finish-incomplete-split', 'notice'); + +-- +-- First create the test table and some helper functions +-- +create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off); + +create index on gin_incomplete_splits using gin (i) with (fastupdate = off); + +-- Creates an array with all integers from $1 (inclusive) $2 (exclusive) +create function range_array(int, int) returns int[] language sql immutable as $$ + select array_agg(g) from generate_series($1, $2 - 1) g +$$; + +-- Inserts an array with 'n' rows to the test table. Pass :next_i as +-- the first argument, returns the new value for :next_i. +create function insert_n(first_i int, n int) returns int language plpgsql as $$ +begin + insert into gin_incomplete_splits values (range_array(first_i, first_i+n)); + return first_i + n; +end; +$$; + +-- Inserts to the table until an insert fails. Like insert_n(), returns the +-- new value for :next_i. +create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$ +declare + i integer; +begin + -- Insert arrays with 'step' elements each, until an error occurs. + loop + begin + select insert_n(next_i, step) into next_i; + exception when others then + raise notice 'failed with: %', sqlerrm; + exit; + end; + + -- The caller is expected to set an injection point that eventuall + -- causes an error. But bail out if still no error after 10000 + -- attempts, so that we don't get stuck in an infinite loop. + i := i + 1; + if i = 10000 then + raise 'no error on inserts after '; + end if; + end loop; + + return next_i; +end; +$$; + +-- Check the invariants. +create function verify(next_i int) returns bool language plpgsql as $$ +declare + a integer[]; + elem integer; + c integer; +begin + -- Perform a scan over the trailing part of the index, where the + -- possible incomplete splits are. (We don't check the whole table, + -- because that'd be pretty slow.) + c := 0; + -- Find all arrays that overlap with the last 200 inserted integers. Or + -- the next 100, which shouldn't exist. + for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100) + loop + -- count all elements in those arrays in the window. + foreach elem in ARRAY a loop + if elem >= next_i then + raise 'unexpected value % in array', elem; + end if; + if elem >= next_i - 200 then + c := c + 1; + end if; + end loop; + end loop; + if c <> 200 then + raise 'unexpected count % ', c; + end if; + + return true; +end; +$$; + +-- Insert one array to get started. +select insert_n(1, 1000) as next_i +\gset +select verify(:next_i); + + +-- +-- Test incomplete leaf split +-- +SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error'); +select insert_until_fail(:next_i) as next_i +\gset +SELECT injection_points_detach('gin-leave-leaf-split-incomplete'); + +-- Verify that a scan works even though there's an incomplete split +select verify(:next_i); + +-- Insert some more rows, finishing the split +select insert_n(:next_i, 10) as next_i +\gset +-- Verify that a scan still works +select verify(:next_i); + + +-- +-- Test incomplete internal page split +-- +SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error'); +select insert_until_fail(:next_i, 100) as next_i +\gset +SELECT injection_points_detach('gin-leave-internal-split-incomplete'); + + -- Verify that a scan works even though there's an incomplete split +select verify(:next_i); + +-- Insert some more rows, finishing the split +select insert_n(:next_i, 10) as next_i +\gset +-- Verify that a scan still works +select verify(:next_i); diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 397e0906e6..8fbe742d38 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -5,6 +5,7 @@ subdir('commit_ts') subdir('delay_execution') subdir('dummy_index_am') subdir('dummy_seclabel') +subdir('gin') subdir('injection_points') subdir('ldap_password_func') subdir('libpq_pipeline')