From 857f9c36cda520030381bd8c2af20adf0ce0e1d4 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev
Date: Wed, 4 Apr 2018 19:29:00 +0300
Subject: [PATCH] Skip full index scan during cleanup of B-tree indexes when possible

Vacuum of an index consists of two stages: multiple (zero or more)
ambulkdelete calls and one amvacuumcleanup call.  When the workload on a
particular table is append-only, autovacuum isn't expected to touch that
table.  However, a user may run VACUUM manually in order to fill the
visibility map and get the benefit of index-only scans.  In that case
ambulkdelete isn't called for the table's indexes (because no heap tuples
were deleted); only amvacuumcleanup is called, and it performs a full index
scan for two purposes: putting recyclable pages into the free space map and
updating index statistics.

This patch allows btvacuumcleanup to skip the full index scan when two
conditions are satisfied: no pages need to be put into the free space map,
and the index statistics aren't stale.  To check the first condition, we
store the oldest btpo_xact in the meta-page; when it precedes
RecentGlobalXmin, some deleted pages can be recycled.  To check the second
condition, we store the number of heap tuples observed during the previous
full index scan by cleanup.  If the fraction of newly inserted tuples is
less than vacuum_cleanup_index_scale_factor, the statistics aren't
considered stale.  vacuum_cleanup_index_scale_factor can be set both as a
reloption and as a GUC (which supplies the default).

This patch bumps the B-tree meta-page version.  The meta-page is upgraded
"on the fly": during VACUUM it is rewritten with the new version.  No
special handling in pg_upgrade is required.

Author: Masahiko Sawada, Alexander Korotkov
Reviewed by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
---
 contrib/amcheck/verify_nbtree.c               |   8 +-
 contrib/pageinspect/Makefile                  |   3 +-
 contrib/pageinspect/btreefuncs.c              |   4 +-
 contrib/pageinspect/expected/btree.out        |  16 +-
 contrib/pageinspect/pageinspect--1.6--1.7.sql |  26 +++
 contrib/pageinspect/pageinspect.control       |   2 +-
 contrib/pgstattuple/expected/pgstattuple.out  |  10 +-
 doc/src/sgml/config.sgml                      |  25 +++
 doc/src/sgml/pageinspect.sgml                 |  16 +-
 doc/src/sgml/ref/create_index.sgml            |  15 ++
 src/backend/access/common/reloptions.c        |  13 +-
 src/backend/access/nbtree/nbtinsert.c         |  12 ++
 src/backend/access/nbtree/nbtpage.c           | 150 ++++++++++++++++--
 src/backend/access/nbtree/nbtree.c            | 118 +++++++++++++-
 src/backend/access/nbtree/nbtxlog.c           |   6 +-
 src/backend/utils/init/globals.c              |   2 +
 src/backend/utils/misc/guc.c                  |  10 ++
 src/include/access/nbtree.h                   |  11 +-
 src/include/access/nbtxlog.h                  |   4 +
 src/include/miscadmin.h                       |   2 +
 src/include/utils/rel.h                       |   2 +
 src/test/regress/expected/btree_index.out     |  29 ++++
 src/test/regress/sql/btree_index.sql          |  19 +++
 23 files changed, 458 insertions(+), 45 deletions(-)
 create mode 100644 contrib/pageinspect/pageinspect--1.6--1.7.sql

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index a15fe21933..52aa633056 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
                  errmsg("index \"%s\" meta page is corrupt",
                         RelationGetRelationName(state->rel))));

-        if (metad->btm_version != BTREE_VERSION)
+        if (metad->btm_version < BTREE_MIN_VERSION ||
+            metad->btm_version > BTREE_VERSION)
             ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(state->rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); } /* diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index 0a3cbeeb10..e5a581f141 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \ brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES) EXTENSION = pageinspect -DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ +DATA = pageinspect--1.6--1.7.sql \ + pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \ pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \ pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 4f834676ea..5133653791 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS) BTMetaPageData *metad; TupleDesc tupleDesc; int j; - char *values[6]; + char *values[8]; Buffer buffer; Page page; HeapTuple tuple; @@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS) values[j++] = psprintf("%d", metad->btm_level); values[j++] = psprintf("%d", metad->btm_fastroot); values[j++] = psprintf("%d", metad->btm_fastlevel); + values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); + values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values); diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 67b103add3..2aaa4df53b 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text'); CREATE INDEX test1_a_idx ON test1 USING btree (a); \x SELECT * FROM bt_metap('test1_a_idx'); --[ RECORD 1 ]----- -magic | 340322 -version | 2 -root | 1 -level | 0 -fastroot | 1 -fastlevel | 0 +-[ RECORD 1 ]-----------+------- +magic | 340322 +version | 3 +root | 1 +level | 0 +fastroot | 1 +fastlevel | 0 +oldest_xact | 0 +last_cleanup_num_tuples | -1 SELECT * FROM bt_page_stats('test1_a_idx', 0); ERROR: block 0 is a meta page diff --git a/contrib/pageinspect/pageinspect--1.6--1.7.sql b/contrib/pageinspect/pageinspect--1.6--1.7.sql new file mode 100644 index 0000000000..2433a21af2 --- /dev/null +++ b/contrib/pageinspect/pageinspect--1.6--1.7.sql @@ -0,0 +1,26 @@ +/* contrib/pageinspect/pageinspect--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. 
\quit + +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4, + OUT oldest_xact int4, + OUT last_cleanup_num_tuples real) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control index 1a61c9f5ad..dcfc61f22d 100644 --- a/contrib/pageinspect/pageinspect.control +++ b/contrib/pageinspect/pageinspect.control @@ -1,5 +1,5 @@ # pageinspect extension comment = 'inspect the contents of database pages at a low level' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pageinspect' relocatable = true diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 20b5585d03..a7087f6d45 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -48,7 +48,7 @@ select version, tree_level, from pgstatindex('test_pkey'); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -58,7 +58,7 @@ select version, tree_level, from pgstatindex('test_pkey'::text); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -68,7 +68,7 @@ select version, tree_level, from pgstatindex('test_pkey'::name); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select version, tree_level, @@ -78,7 +78,7 @@ select version, tree_level, from pgstatindex('test_pkey'::regclass); version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation ---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+-------------------- - 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN + 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN (1 row) select pg_relpages('test'); @@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a); select pgstatindex('test_partition_idx'); pgstatindex ------------------------------ - (2,0,8192,0,0,0,0,0,NaN,NaN) + (3,0,8192,0,0,0,0,0,NaN,NaN) (1 row) select pgstathashindex('test_partition_hash_idx'); diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml 
index e7d408824e..a189a8efc3 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1882,6 +1882,31 @@ include_dir 'conf.d'
+
+     Index Vacuum
+
+      vacuum_cleanup_index_scale_factor (floating point)
+       vacuum_cleanup_index_scale_factor configuration parameter
+
+       When no tuples were deleted from the heap, B-tree indexes might still
+       be scanned during the VACUUM cleanup stage for two
+       reasons.  The first reason is that the B-tree index contains deleted
+       pages which can be recycled during cleanup.  The second reason is that
+       the B-tree index statistics are stale.  Index statistics are considered
+       stale when the number of tuples inserted since the previous statistics
+       collection is greater than the vacuum_cleanup_index_scale_factor
+       fraction of the total number of heap tuples.
+

     Background Writer

diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 23570af4bf..4d5da186bb 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
      index's metapage.  For example:
 test=# SELECT * FROM bt_metap('pg_cast_oid_index');
--[ RECORD 1 ]-----
-magic     | 340322
-version   | 2
-root      | 1
-level     | 0
-fastroot  | 1
-fastlevel | 0
+-[ RECORD 1 ]-----------+-------
+magic                   | 340322
+version                 | 3
+root                    | 1
+level                   | 0
+fastroot                | 1
+fastlevel               | 0
+oldest_xact             | 582
+last_cleanup_num_tuples | 1000

diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index ba1c5d6392..e9521fbfb9 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
+
+   B-tree indexes additionally accept this parameter:
+
+    vacuum_cleanup_index_scale_factor
+
+     Per-table value for vacuum_cleanup_index_scale_factor.
+ + + + + GiST indexes additionally accept this parameter: diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 35c09987ad..69ab2f101c 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -409,6 +409,15 @@ static relopt_real realRelOpts[] = }, 0, -1.0, DBL_MAX }, + { + { + "vacuum_cleanup_index_scale_factor", + "Number of tuple inserts prior to index cleanup as a fraction of reltuples.", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, /* list terminator */ {{NULL}} }; @@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"user_catalog_table", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, user_catalog_table)}, {"parallel_workers", RELOPT_TYPE_INT, - offsetof(StdRdOptions, parallel_workers)} + offsetof(StdRdOptions, parallel_workers)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)} }; options = parseRelOptions(reloptions, validate, kind, &numoptions); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 40111990c5..fd7360278d 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(metabuf)) { + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = itup_blkno; metad->btm_fastlevel = lpageop->btpo.level; MarkBufferDirty(metabuf); @@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = + metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Create downlink item for left page (old root). 
Since this will be the * first item in a non-leaf page, it implicitly has minus-infinity key @@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; + md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 92afe2de38..505a67e6ed 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; } +/* + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new. + * + * This routine does purely in-memory image upgrade. Caller is + * responsible for locking, WAL-logging etc. + */ +void +_bt_upgrademetapage(Page page) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque; + + metad = BTPageGetMeta(page); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* It must be really a meta page of upgradable version */ + Assert(metaopaque->btpo_flags & BTP_META); + Assert(metad->btm_version < BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + + /* Set version number and fill extra fields added into version 3 */ + metad->btm_version = BTREE_VERSION; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + + /* Adjust pd_lower (see _bt_initmetapage() for details) */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * _bt_update_meta_cleanup_info() -- Update cleanup-related information in + * the metapage. + * + * This routine checks if provided cleanup-related information is matching + * to those written in the metapage. On mismatch, metapage is overritten. 
+ */ +void +_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, + float8 numHeapTuples) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool needsRewrite = false; + XLogRecPtr recptr; + + /* read the metapage and check if it needs rewrite */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* outdated version of metapage always needs rewrite */ + if (metad->btm_version < BTREE_VERSION) + needsRewrite = true; + else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || + metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) + needsRewrite = true; + + if (!needsRewrite) + { + _bt_relbuf(rel, metabuf); + return; + } + + /* trade in our read lock for a write lock */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(metabuf, BT_WRITE); + + START_CRIT_SECTION(); + + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + + /* update cleanup-related infromation */ + metad->btm_oldest_btpo_xact = oldestBtpoXact; + metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; + MarkBufferDirty(metabuf); + + /* write wal record if needed */ + if (RelationNeedsWAL(rel)) + { + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + md.root = metad->btm_root; + md.level = metad->btm_level; + md.fastroot = metad->btm_fastroot; + md.fastlevel = metad->btm_fastlevel; + md.oldest_btpo_xact = oldestBtpoXact; + md.last_cleanup_num_heap_tuples = numHeapTuples; + + XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); + + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + _bt_relbuf(rel, metabuf); +} + /* * _bt_getroot() -- Get the root page of the btree. 
* @@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access) metad = (BTMetaPageData *) rel->rd_amcache; /* We shouldn't have cached it if any of these fail */ Assert(metad->btm_magic == BTREE_MAGIC); - Assert(metad->btm_version == BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; @@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) @@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access) LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BT_WRITE); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Race condition: if someone else initialized the metadata between * the time we released the read lock and acquired the write lock, we @@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access) metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); @@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access) md.level = 0; md.fastroot = rootblkno; md.fastlevel = 0; + md.oldest_btpo_xact = InvalidTransactionId; + md.last_cleanup_num_heap_tuples = -1.0; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, fail */ if (metad->btm_root == P_NONE) @@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* * If there's no root page yet, _bt_getroot() doesn't expect a cache @@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool 
*rightsib_empty) /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); @@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6fca8e358f..06badc90ba 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -19,11 +19,14 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/relscan.h" #include "access/xlog.h" #include "commands/vacuum.h" +#include "miscadmin.h" #include "nodes/execnodes.h" #include "pgstat.h" +#include "postmaster/autovacuum.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" @@ -45,6 +48,7 @@ typedef struct BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber totFreePages; /* true total # of free pages */ + TransactionId oldestBtpoXact; MemoryContext pagedelcontext; } BTVacState; @@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid); + BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); @@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) SpinLockRelease(&btscan->btps_mutex); } +/* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that + * btbulkdelete() wasn't called. + */ +static bool +_bt_vacuum_needs_cleanup(IndexVacuumInfo *info) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool result = false; + + metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (metad->btm_version < BTREE_VERSION) + { + /* + * Do cleanup if metapage needs upgrade, because we don't have + * cleanup-related meta-information yet. + */ + result = true; + } + else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && + TransactionIdPrecedes(metad->btm_oldest_btpo_xact, + RecentGlobalXmin)) + { + /* + * If oldest btpo.xact in the deleted pages is older than + * RecentGlobalXmin, then at least one deleted page can be recycled. + */ + result = true; + } + else + { + StdRdOptions *relopts; + float8 cleanup_scale_factor; + + /* + * If table receives large enough amount of insertions and no cleanup + * was performed, then index might appear to have stalled statistics. + * In order to evade that, we perform cleanup when table receives + * vacuum_cleanup_index_scale_factor fractions of insertions. 
+	 */
+	relopts = (StdRdOptions *) info->index->rd_options;
+	cleanup_scale_factor = (relopts &&
+							relopts->vacuum_cleanup_index_scale_factor >= 0)
+		? relopts->vacuum_cleanup_index_scale_factor
+		: vacuum_cleanup_index_scale_factor;
+
+	if (cleanup_scale_factor < 0 ||
+		metad->btm_last_cleanup_num_heap_tuples < 0 ||
+		info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
+		metad->btm_last_cleanup_num_heap_tuples)
+		result = true;
+	}
+
+	_bt_relbuf(info->index, metabuf);
+	return result;
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* The ENSURE stuff ensures we clean up shared memory on failure */
 	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	{
+		TransactionId oldestBtpoXact;
+
 		cycleid = _bt_start_vacuum(rel);

-		btvacuumscan(info, stats, callback, callback_state, cycleid);
+		btvacuumscan(info, stats, callback, callback_state, cycleid,
+					 &oldestBtpoXact);
+
+		/*
+		 * Update cleanup-related information in the metapage.  This
+		 * information is used only for cleanup, but keeping it up to date
+		 * can avoid unnecessary cleanup even after a bulkdelete.
+		 */
+		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
+									 info->num_heap_tuples);
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	_bt_end_vacuum(rel);
@@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	/*
 	 * If btbulkdelete was called, we need not do anything, just return the
-	 * stats from the latest btbulkdelete call.  If it wasn't called, we must
-	 * still do a pass over the index, to recycle any newly-recyclable pages
-	 * and to obtain index statistics.
+	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
+	 * still need to do a pass over the index, to recycle any newly-recyclable
+	 * pages and to obtain index statistics.  _bt_vacuum_needs_cleanup checks
+	 * whether there are newly-recyclable pages or stale index statistics.
 	 *
 	 * Since we aren't going to actually delete any leaf items, there's no
 	 * need to go through all the vacuum-cycle-ID pushups.
*/ if (stats == NULL) { + TransactionId oldestBtpoXact; + + /* Check if we need a cleanup */ + if (!_bt_vacuum_needs_cleanup(info)) + return NULL; + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - btvacuumscan(info, stats, NULL, NULL, 0); + btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact); + + /* Update cleanup-related information in the metapage */ + _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, + info->num_heap_tuples); } /* @@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid) + BTCycleId cycleid, TransactionId *oldestBtpoXact) { Relation rel = info->index; BTVacState vstate; @@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; + vstate.oldestBtpoXact = InvalidTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; + + if (oldestBtpoXact) + *oldestBtpoXact = vstate.oldestBtpoXact; } /* @@ -1070,6 +1164,11 @@ restart: { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; + + /* Update the oldest btpo.xact */ + if (!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { @@ -1238,7 +1337,12 @@ restart: /* count only this page, else may double-count parent */ if (ndel) + { stats->pages_deleted++; + if (!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; + } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 233c3965d9..b565bcb540 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; md->btm_fastlevel = xlrec->fastlevel; + md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; + md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record) } } - void btree_redo(XLogReaderState *record) { @@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_REUSE_PAGE: btree_xlog_reuse_page(record); break; + case XLOG_BTREE_META_CLEANUP: + _bt_restore_meta(record, 0); + break; default: elog(PANIC, "btree_redo: unknown op code %u", info); } diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 446040d816..c1f0441b08 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -138,3 +138,5 @@ int VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; + +double 
vacuum_cleanup_index_scale_factor; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4ffc8451ca..260ae264d8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] = NULL, NULL, NULL }, + { + {"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM, + gettext_noop("Number of tuple inserts prior to index cleanup as a fraction of reltuples."), + NULL + }, + &vacuum_cleanup_index_scale_factor, + 0.1, 0.0, 100.0, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 2b0b1da763..f532f3ffff 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -102,6 +102,11 @@ typedef struct BTMetaPageData uint32 btm_level; /* tree level of the root page */ BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* following fields are available since page version 3 */ + TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among of + * deleted pages */ + float4 btm_last_cleanup_num_heap_tuples; /* number of heap tuples + * during last cleanup */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -109,7 +114,8 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ -#define BTREE_VERSION 2 /* current version number */ +#define BTREE_VERSION 3 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimal supported version number */ /* * Maximum size of a btree index entry, including its tuple header. @@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); * prototypes for functions in nbtpage.c */ extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_update_meta_cleanup_info(Relation rel, + TransactionId oldestBtpoXact, float8 numHeapTuples); +extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 8297df75fe..a8ccdcec42 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -38,6 +38,8 @@ * vacuum */ #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from * FSM */ +#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the + * metapage */ /* * All that we need to regenerate the meta-data page @@ -48,6 +50,8 @@ typedef struct xl_btree_metadata uint32 level; BlockNumber fastroot; uint32 fastlevel; + TransactionId oldest_btpo_xact; + double last_cleanup_num_heap_tuples; } xl_btree_metadata; /* diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a4574cd533..a429a19964 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -256,6 +256,8 @@ extern int VacuumPageDirty; extern int VacuumCostBalance; extern bool VacuumCostActive; +extern double vacuum_cleanup_index_scale_factor; + /* in tcop/postgres.c */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index c26c395b0b..9826c67fc4 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -287,6 +287,8 @@ typedef struct StdRdOptions { int32 vl_len_; /* varlena header (do not touch directly!) 
*/ int fillfactor; /* page fill factor in percent (0..100) */ + /* fraction of newly inserted tuples prior to trigger index cleanup */ + float8 vacuum_cleanup_index_scale_factor; int toast_tuple_target; /* target for tuple toasting */ AutoVacOpts autovacuum; /* autovacuum-related options */ bool user_catalog_table; /* use as an additional catalog relation */ diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 755cd17792..4778ac14a4 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -150,3 +150,32 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; +-- +-- Test vacuum_cleanup_index_scale_factor +-- +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=40.0} +(1 row) + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=70.0} +(1 row) + diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 65b08c8282..21171f7762 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -92,3 +92,22 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; + +-- +-- Test vacuum_cleanup_index_scale_factor +-- + +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); + +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
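
A minimal usage sketch (not part of the patch; the table and index names
below are hypothetical).  The cleanup-skip condition in
_bt_vacuum_needs_cleanup is num_heap_tuples > (1 + cleanup_scale_factor) *
btm_last_cleanup_num_heap_tuples, so with the default factor of 0.1 and
1,000,000 heap tuples recorded at the previous cleanup, a later manual
VACUUM skips the full B-tree scan until the table grows past 1,100,000
tuples, provided no deleted pages have become recyclable:

    -- per-index setting via the new reloption
    CREATE TABLE append_only_tbl (id int4, payload text);
    CREATE INDEX append_only_idx ON append_only_tbl (id)
        WITH (vacuum_cleanup_index_scale_factor = 0.2);

    -- the setting can be changed later
    ALTER INDEX append_only_idx SET (vacuum_cleanup_index_scale_factor = 0.5);

    -- the GUC supplies the default for indexes without the reloption;
    -- it is PGC_SIGHUP in this patch, so set it in postgresql.conf:
    --   vacuum_cleanup_index_scale_factor = 0.1

    -- a manual VACUUM on an append-only table fills the visibility map;
    -- the cleanup-only index scan is skipped when neither condition holds
    VACUUM append_only_tbl;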