From 7772dece98506233f40c6751e5adb7983a0c1635 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 18 Jul 2019 13:22:54 -0700 Subject: [PATCH] Fix nbtree metapage cache upgrade bug. Commit 857f9c36cda, which taught nbtree VACUUM to avoid unnecessary index scans, bumped the nbtree version number from 2 to 3, while adding the ability for nbtree indexes to be upgraded on-the-fly. Various assertions that assumed that an nbtree index was always on version 2 had to be changed to accept any supported version (version 2 or 3 on Postgres 11). However, a few assertions were missed in the initial commit, all of which were in code paths that cache a local copy of the metapage metadata, where the index had been expected to be on the current version (no longer version 2) as a generic sanity check. Rather than simply update the assertions, follow-up commit 0a64b45152b intentionally made the metapage caching code update the per-backend cached metadata version without changing the on-disk version at the same time. This could even happen when the planner needed to determine the height of a B-Tree for costing purposes. The assertions only fail on Postgres v12 when upgrading from v10, because they were adjusted to use the authoritative shared memory metapage by v12's commit dd299df8. To fix, remove the cache-only upgrade mechanism entirely, and update the assertions themselves to accept any supported version (go back to using the cached version in v12). The fix is almost a full revert of commit 0a64b45152b on the v11 branch. VACUUM only considers the authoritative metapage, and never bothers with a locally cached version, whereas everywhere else isn't interested in the metapage fields that were added by commit 857f9c36cda. It seems unlikely that this bug has affected any user on v11. Reported-By: Christoph Berg Bug: #15896 Discussion: https://postgr.es/m/15896-5b25e260fdb0b081%40postgresql.org Backpatch: 11-, where VACUUM was taught to avoid unnecessary index scans. --- src/backend/access/nbtree/nbtpage.c | 111 +++++++++------------------- src/backend/access/nbtree/nbtxlog.c | 2 + src/include/access/nbtree.h | 4 +- 3 files changed, 40 insertions(+), 77 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 03570300d8..6b2655c573 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -33,7 +33,6 @@ #include "storage/predicate.h" #include "utils/snapmgr.h" -static void _bt_cachemetadata(Relation rel, BTMetaPageData *input); static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, @@ -109,48 +108,13 @@ _bt_upgrademetapage(Page page) ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; } -/* - * Cache metadata from input meta page to rel->rd_amcache. - */ -static void -_bt_cachemetadata(Relation rel, BTMetaPageData *input) -{ - BTMetaPageData *cached_metad; - - /* We assume rel->rd_amcache was already freed by caller */ - Assert(rel->rd_amcache == NULL); - rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, - sizeof(BTMetaPageData)); - - /* Meta page should be of supported version */ - Assert(input->btm_version >= BTREE_MIN_VERSION && - input->btm_version <= BTREE_VERSION); - - cached_metad = (BTMetaPageData *) rel->rd_amcache; - if (input->btm_version >= BTREE_NOVAC_VERSION) - { - /* Version with compatible meta-data, no need to upgrade */ - memcpy(cached_metad, input, sizeof(BTMetaPageData)); - } - else - { - /* - * Upgrade meta-data: copy available information from meta-page and - * fill new fields with default values. - * - * Note that we cannot upgrade to version 4+ without a REINDEX, since - * extensive on-disk changes are required. - */ - memcpy(cached_metad, input, offsetof(BTMetaPageData, btm_oldest_btpo_xact)); - cached_metad->btm_version = BTREE_NOVAC_VERSION; - cached_metad->btm_oldest_btpo_xact = InvalidTransactionId; - cached_metad->btm_last_cleanup_num_heap_tuples = -1.0; - } -} - /* * Get metadata from share-locked buffer containing metapage, while performing - * standard sanity checks. Sanity checks here must match _bt_getroot(). + * standard sanity checks. + * + * Callers that cache data returned here in local cache should note that an + * on-the-fly upgrade using _bt_upgrademetapage() can change the version field + * and BTREE_NOVAC_VERSION specific fields without invalidating local cache. */ static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf) @@ -293,8 +257,6 @@ Buffer _bt_getroot(Relation rel, int access) { Buffer metabuf; - Page metapg; - BTPageOpaque metaopaque; Buffer rootbuf; Page rootpage; BTPageOpaque rootopaque; @@ -347,30 +309,13 @@ _bt_getroot(Relation rel, int access) } metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); - metapg = BufferGetPage(metabuf); - metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); - metad = BTPageGetMeta(metapg); - - /* sanity-check the metapage */ - if (!P_ISMETA(metaopaque) || - metad->btm_magic != BTREE_MAGIC) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("index \"%s\" is not a btree", - RelationGetRelationName(rel)))); - - if (metad->btm_version < BTREE_MIN_VERSION || - metad->btm_version > BTREE_VERSION) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, " - "current version %d, minimal supported version %d", - RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); + metad = _bt_getmeta(rel, metabuf); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) { + Page metapg; + /* If access = BT_READ, caller doesn't want us to create root yet */ if (access == BT_READ) { @@ -412,6 +357,8 @@ _bt_getroot(Relation rel, int access) rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); rootopaque->btpo.level = 0; rootopaque->btpo_cycleid = 0; + /* Get raw page pointer for metapage */ + metapg = BufferGetPage(metabuf); /* NO ELOG(ERROR) till meta is updated */ START_CRIT_SECTION(); @@ -473,7 +420,7 @@ _bt_getroot(Relation rel, int access) LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); - /* okay, metadata is correct, release lock on it */ + /* okay, metadata is correct, release lock on it without caching */ _bt_relbuf(rel, metabuf); } else @@ -485,7 +432,9 @@ _bt_getroot(Relation rel, int access) /* * Cache the metapage data for next time */ - _bt_cachemetadata(rel, metad); + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); /* * We are done with the metapage; arrange to release it via first @@ -659,16 +608,19 @@ _bt_getrootheight(Relation rel) /* * Cache the metapage data for next time */ - _bt_cachemetadata(rel, metad); - /* We shouldn't have cached it if any of these fail */ - Assert(metad->btm_magic == BTREE_MAGIC); - Assert(metad->btm_version >= BTREE_NOVAC_VERSION); - Assert(metad->btm_fastroot != P_NONE); + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); _bt_relbuf(rel, metabuf); } /* Get cached page */ metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(metad->btm_fastroot != P_NONE); return metad->btm_fastlevel; } @@ -709,17 +661,26 @@ _bt_heapkeyspace(Relation rel) /* * Cache the metapage data for next time + * + * An on-the-fly version upgrade performed by _bt_upgrademetapage() + * can change the nbtree version for an index without invalidating any + * local cache. This is okay because it can only happen when moving + * from version 2 to version 3, both of which are !heapkeyspace + * versions. */ - _bt_cachemetadata(rel, metad); - /* We shouldn't have cached it if any of these fail */ - Assert(metad->btm_magic == BTREE_MAGIC); - Assert(metad->btm_version >= BTREE_NOVAC_VERSION); - Assert(metad->btm_fastroot != P_NONE); + rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, + sizeof(BTMetaPageData)); + memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); _bt_relbuf(rel, metabuf); } /* Get cached page */ metad = (BTMetaPageData *) rel->rd_amcache; + /* We shouldn't have cached it if any of these fail */ + Assert(metad->btm_magic == BTREE_MAGIC); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); + Assert(metad->btm_fastroot != P_NONE); return metad->btm_version > BTREE_NOVAC_VERSION; } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 3147ea4726..dd5315c1aa 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -107,6 +107,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; md->btm_fastlevel = xlrec->fastlevel; + /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */ + Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index a3583f225b..83e0e6c28e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -97,12 +97,12 @@ typedef BTPageOpaqueData *BTPageOpaque; typedef struct BTMetaPageData { uint32 btm_magic; /* should contain BTREE_MAGIC */ - uint32 btm_version; /* should contain BTREE_VERSION */ + uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ BlockNumber btm_root; /* current root location */ uint32 btm_level; /* tree level of the root page */ BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ - /* following fields are available since page version 3 */ + /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples