diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index a15fe21933..52aa633056 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) errmsg("index \"%s\" meta page is corrupt", RelationGetRelationName(state->rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(state->rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); } /* diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index 0a3cbeeb10..e5a581f141 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \ brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES) EXTENSION = pageinspect -DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ +DATA = pageinspect--1.6--1.7.sql \ + pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \ pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \ pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 4f834676ea..5133653791 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS) BTMetaPageData *metad; TupleDesc tupleDesc; int j; - char *values[6]; + char *values[8]; Buffer buffer; Page page; HeapTuple tuple; @@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS) values[j++] = psprintf("%d", metad->btm_level); values[j++] = 
psprintf("%d", metad->btm_fastroot); values[j++] = psprintf("%d", metad->btm_fastlevel); + values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); + values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values); diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 67b103add3..2aaa4df53b 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text'); CREATE INDEX test1_a_idx ON test1 USING btree (a); \x SELECT * FROM bt_metap('test1_a_idx'); --[ RECORD 1 ]----- -magic | 340322 -version | 2 -root | 1 -level | 0 -fastroot | 1 -fastlevel | 0 +-[ RECORD 1 ]-----------+------- +magic | 340322 +version | 3 +root | 1 +level | 0 +fastroot | 1 +fastlevel | 0 +oldest_xact | 0 +last_cleanup_num_tuples | -1 SELECT * FROM bt_page_stats('test1_a_idx', 0); ERROR: block 0 is a meta page diff --git a/contrib/pageinspect/pageinspect--1.6--1.7.sql b/contrib/pageinspect/pageinspect--1.6--1.7.sql new file mode 100644 index 0000000000..2433a21af2 --- /dev/null +++ b/contrib/pageinspect/pageinspect--1.6--1.7.sql @@ -0,0 +1,26 @@ +/* contrib/pageinspect/pageinspect--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. 
\quit + +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4, + OUT oldest_xact int4, + OUT last_cleanup_num_tuples real) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control index 1a61c9f5ad..dcfc61f22d 100644 --- a/contrib/pageinspect/pageinspect.control +++ b/contrib/pageinspect/pageinspect.control @@ -1,5 +1,5 @@ # pageinspect extension comment = 'inspect the contents of database pages at a low level' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pageinspect' relocatable = true diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 20b5585d03..a7087f6d45 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -48,7 +48,7 @@ select version, tree_level, from pgstatindex('test_pkey'); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -58,7 +58,7 @@ select version, tree_level, from pgstatindex('test_pkey'::text); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation 
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -68,7 +68,7 @@ select version, tree_level, from pgstatindex('test_pkey'::name); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -78,7 +78,7 @@ select version, tree_level, from pgstatindex('test_pkey'::regclass); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select pg_relpages('test'); @@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a); select pgstatindex('test_partition_idx'); pgstatindex ------------------------------ - (2,0,8192,0,0,0,0,0,NaN,NaN) + (3,0,8192,0,0,0,0,0,NaN,NaN) (1 row) select pgstathashindex('test_partition_hash_idx'); diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e7d408824e..a189a8efc3 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1882,6 +1882,31 @@ include_dir 'conf.d' + + Index Vacuum + + + vacuum_cleanup_index_scale_factor (floating point) + + vacuum_cleanup_index_scale_factor configuration parameter + + + + + When no tuples were deleted from the heap, B-tree 
indexes might still + be scanned during the VACUUM cleanup stage for two + reasons. The first reason is that the B-tree index contains deleted pages + which can be recycled during cleanup. The second reason is that the B-tree + index statistics are stale. Index statistics are considered stale when + the number of tuples inserted since the previous statistics collection + is greater than the vacuum_cleanup_index_scale_factor + fraction of the total number of heap tuples. + + + + + + Background Writer diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 23570af4bf..4d5da186bb 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class index's metapage. For example: test=# SELECT * FROM bt_metap('pg_cast_oid_index'); --[ RECORD 1 ]----- -magic | 340322 -version | 2 -root | 1 -level | 0 -fastroot | 1 -fastlevel | 0 +-[ RECORD 1 ]-----------+------- +magic | 340322 +version | 3 +root | 1 +level | 0 +fastroot | 1 +fastlevel | 0 +oldest_xact | 582 +last_cleanup_num_tuples | 1000 diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index ba1c5d6392..e9521fbfb9 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] + + B-tree indexes additionally accept this parameter: + + + + + vacuum_cleanup_index_scale_factor + + + Per-index value for vacuum_cleanup_index_scale_factor. 
+ + + + + GiST indexes additionally accept this parameter: diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 35c09987ad..69ab2f101c 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -409,6 +409,15 @@ static relopt_real realRelOpts[] = }, 0, -1.0, DBL_MAX }, + { + { + "vacuum_cleanup_index_scale_factor", + "Number of tuple inserts prior to index cleanup as a fraction of reltuples.", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, /* list terminator */ {{NULL}} }; @@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"user_catalog_table", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, user_catalog_table)}, {"parallel_workers", RELOPT_TYPE_INT, - offsetof(StdRdOptions, parallel_workers)} + offsetof(StdRdOptions, parallel_workers)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)} }; options = parseRelOptions(reloptions, validate, kind, &numoptions); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 40111990c5..fd7360278d 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(metabuf)) { + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = itup_blkno; metad->btm_fastlevel = lpageop->btpo.level; MarkBufferDirty(metabuf); @@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = + metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) 
&xlmeta, sizeof(xl_btree_metadata)); @@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Create downlink item for left page (old root). Since this will be the * first item in a non-leaf page, it implicitly has minus-infinity key @@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; + md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 92afe2de38..505a67e6ed 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; } +/* + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new. + * + * This routine does purely in-memory image upgrade. Caller is + * responsible for locking, WAL-logging etc. 
+ */ +void +_bt_upgrademetapage(Page page) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque; + + metad = BTPageGetMeta(page); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* It must really be a metapage of an upgradable version */ + Assert(metaopaque->btpo_flags & BTP_META); + Assert(metad->btm_version < BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + + /* Set version number and fill extra fields added in version 3 */ + metad->btm_version = BTREE_VERSION; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + + /* Adjust pd_lower (see _bt_initmetapage() for details) */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * _bt_update_meta_cleanup_info() -- Update cleanup-related information in + * the metapage. + * + * This routine checks whether the provided cleanup-related information + * matches what is stored in the metapage. On mismatch, the metapage is overwritten. 
+ */ +void +_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, + float8 numHeapTuples) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool needsRewrite = false; + XLogRecPtr recptr; + + /* read the metapage and check whether it needs a rewrite */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* an outdated metapage version always needs a rewrite */ + if (metad->btm_version < BTREE_VERSION) + needsRewrite = true; + else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || + metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) + needsRewrite = true; + + if (!needsRewrite) + { + _bt_relbuf(rel, metabuf); + return; + } + + /* trade in our read lock for a write lock */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(metabuf, BT_WRITE); + + START_CRIT_SECTION(); + + /* upgrade the metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + + /* update cleanup-related information */ + metad->btm_oldest_btpo_xact = oldestBtpoXact; + metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; + MarkBufferDirty(metabuf); + + /* write a WAL record if needed */ + if (RelationNeedsWAL(rel)) + { + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + md.root = metad->btm_root; + md.level = metad->btm_level; + md.fastroot = metad->btm_fastroot; + md.fastlevel = metad->btm_fastlevel; + md.oldest_btpo_xact = oldestBtpoXact; + md.last_cleanup_num_heap_tuples = numHeapTuples; + + XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); + + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + _bt_relbuf(rel, metabuf); +} + /* * _bt_getroot() -- Get the root page of the btree. 
* @@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access) metad = (BTMetaPageData *) rel->rd_amcache; /* We shouldn't have cached it if any of these fail */ Assert(metad->btm_magic == BTREE_MAGIC); - Assert(metad->btm_version == BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; @@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) @@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access) LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BT_WRITE); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Race condition: if someone else initialized the metadata between * the time we released the read lock and acquired the write lock, we @@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access) metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); @@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access) md.level = 0; md.fastroot = rootblkno; md.fastlevel = 0; + md.oldest_btpo_xact = InvalidTransactionId; + md.last_cleanup_num_heap_tuples = 
-1.0; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, fail */ if (metad->btm_root == P_NONE) @@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* * If there's no root page yet, _bt_getroot() doesn't expect a cache @@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); @@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.level = metad->btm_level; xlmeta.fastroot = 
metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6fca8e358f..06badc90ba 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -19,11 +19,14 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/relscan.h" #include "access/xlog.h" #include "commands/vacuum.h" +#include "miscadmin.h" #include "nodes/execnodes.h" #include "pgstat.h" +#include "postmaster/autovacuum.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" @@ -45,6 +48,7 @@ typedef struct BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber totFreePages; /* true total # of free pages */ + TransactionId oldestBtpoXact; MemoryContext pagedelcontext; } BTVacState; @@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid); + BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); @@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) SpinLockRelease(&btscan->btps_mutex); } +/* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that + * btbulkdelete() wasn't called. 
+ */ +static bool +_bt_vacuum_needs_cleanup(IndexVacuumInfo *info) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool result = false; + + metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (metad->btm_version < BTREE_VERSION) + { + /* + * Do cleanup if the metapage needs an upgrade, because we don't have + * cleanup-related meta-information yet. + */ + result = true; + } + else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && + TransactionIdPrecedes(metad->btm_oldest_btpo_xact, + RecentGlobalXmin)) + { + /* + * If the oldest btpo.xact in the deleted pages is older than + * RecentGlobalXmin, then at least one deleted page can be recycled. + */ + result = true; + } + else + { + StdRdOptions *relopts; + float8 cleanup_scale_factor; + + /* + * If the table receives a large enough number of insertions and no + * cleanup was performed, the index might end up with stale statistics. + * To avoid that, we perform cleanup once the heap tuple count exceeds the + * count at last cleanup by a vacuum_cleanup_index_scale_factor fraction. + */ + relopts = (StdRdOptions *) info->index->rd_options; + cleanup_scale_factor = (relopts && + relopts->vacuum_cleanup_index_scale_factor >= 0) + ? relopts->vacuum_cleanup_index_scale_factor + : vacuum_cleanup_index_scale_factor; + + if (cleanup_scale_factor < 0 || + metad->btm_last_cleanup_num_heap_tuples < 0 || + info->num_heap_tuples > (1.0 + cleanup_scale_factor) * + metad->btm_last_cleanup_num_heap_tuples) + result = true; + } + + _bt_relbuf(info->index, metabuf); + return result; +} + /* * Bulk deletion of all index entries pointing to a set of heap tuples. 
* The set of target tuples is specified via a callback routine that tells @@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* The ENSURE stuff ensures we clean up shared memory on failure */ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); { + TransactionId oldestBtpoXact; + cycleid = _bt_start_vacuum(rel); - btvacuumscan(info, stats, callback, callback_state, cycleid); + btvacuumscan(info, stats, callback, callback_state, cycleid, + &oldestBtpoXact); + + /* + * Update cleanup-related information in the metapage. This information + * is used only for cleanup, but keeping it up to date can avoid + * unnecessary cleanup even after bulkdelete. + */ + _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, + info->num_heap_tuples); } PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); _bt_end_vacuum(rel); @@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* * If btbulkdelete was called, we need not do anything, just return the - * stats from the latest btbulkdelete call. If it wasn't called, we must - * still do a pass over the index, to recycle any newly-recyclable pages - * and to obtain index statistics. + * stats from the latest btbulkdelete call. If it wasn't called, we might + * still need to do a pass over the index, to recycle any newly-recyclable + * pages and to obtain index statistics. _bt_vacuum_needs_cleanup checks + * whether there are newly-recyclable pages or stale index statistics. * * Since we aren't going to actually delete any leaf items, there's no * need to go through all the vacuum-cycle-ID pushups. 
*/ if (stats == NULL) { + TransactionId oldestBtpoXact; + + /* Check if we need a cleanup */ + if (!_bt_vacuum_needs_cleanup(info)) + return NULL; + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - btvacuumscan(info, stats, NULL, NULL, 0); + btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact); + + /* Update cleanup-related information in the metapage */ + _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, + info->num_heap_tuples); } /* @@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid) + BTCycleId cycleid, TransactionId *oldestBtpoXact) { Relation rel = info->index; BTVacState vstate; @@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; + vstate.oldestBtpoXact = InvalidTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; + + if (oldestBtpoXact) + *oldestBtpoXact = vstate.oldestBtpoXact; } /* @@ -1070,6 +1164,11 @@ restart: { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; + + /* Update the oldest btpo.xact */ + if (!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { @@ -1238,7 +1337,12 @@ restart: /* count only this page, else may double-count parent */ if (ndel) + { stats->pages_deleted++; + if 
(!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; + } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 233c3965d9..b565bcb540 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; md->btm_fastlevel = xlrec->fastlevel; + md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; + md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record) } } - void btree_redo(XLogReaderState *record) { @@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_REUSE_PAGE: btree_xlog_reuse_page(record); break; + case XLOG_BTREE_META_CLEANUP: + _bt_restore_meta(record, 0); + break; default: elog(PANIC, "btree_redo: unknown op code %u", info); } diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 446040d816..c1f0441b08 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -138,3 +138,5 @@ int VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; + +double vacuum_cleanup_index_scale_factor; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4ffc8451ca..260ae264d8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] = NULL, NULL, NULL }, + { + {"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM, + gettext_noop("Number of tuple inserts prior to 
index cleanup as a fraction of reltuples."), + NULL + }, + &vacuum_cleanup_index_scale_factor, + 0.1, 0.0, 100.0, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 2b0b1da763..f532f3ffff 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -102,6 +102,11 @@ typedef struct BTMetaPageData uint32 btm_level; /* tree level of the root page */ BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* following fields are available since page version 3 */ + TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among + * deleted pages */ + float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples + * during last cleanup */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -109,7 +114,8 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ -#define BTREE_VERSION 2 /* current version number */ +#define BTREE_VERSION 3 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimal supported version number */ /* * Maximum size of a btree index entry, including its tuple header. 
@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); * prototypes for functions in nbtpage.c */ extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_update_meta_cleanup_info(Relation rel, + TransactionId oldestBtpoXact, float8 numHeapTuples); +extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 8297df75fe..a8ccdcec42 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -38,6 +38,8 @@ * vacuum */ #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from * FSM */ +#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the + * metapage */ /* * All that we need to regenerate the meta-data page @@ -48,6 +50,8 @@ typedef struct xl_btree_metadata uint32 level; BlockNumber fastroot; uint32 fastlevel; + TransactionId oldest_btpo_xact; + double last_cleanup_num_heap_tuples; } xl_btree_metadata; /* diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a4574cd533..a429a19964 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -256,6 +256,8 @@ extern int VacuumPageDirty; extern int VacuumCostBalance; extern bool VacuumCostActive; +extern double vacuum_cleanup_index_scale_factor; + /* in tcop/postgres.c */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index c26c395b0b..9826c67fc4 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -287,6 +287,8 @@ typedef struct StdRdOptions { int32 vl_len_; /* varlena header (do not touch directly!) 
*/ int fillfactor; /* page fill factor in percent (0..100) */ + /* fraction of newly inserted tuples prior to trigger index cleanup */ + float8 vacuum_cleanup_index_scale_factor; int toast_tuple_target; /* target for tuple toasting */ AutoVacOpts autovacuum; /* autovacuum-related options */ bool user_catalog_table; /* use as an additional catalog relation */ diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 755cd17792..4778ac14a4 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -150,3 +150,32 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; +-- +-- Test vacuum_cleanup_index_scale_factor +-- +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=40.0} +(1 row) + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". 
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=70.0} +(1 row) + diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 65b08c8282..21171f7762 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -92,3 +92,22 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. 
insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; + +-- +-- Test vacuum_cleanup_index_scale_factor +-- + +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); + +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;