From ad98fb14226ae6456fbaed7990ee7591cbe5efd2 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 13 Jan 2024 13:46:27 -0500 Subject: [PATCH] Cope with catcache entries becoming stale during detoasting. We've long had a policy that any toasted fields in a catalog tuple should be pulled in-line before entering the tuple in a catalog cache. However, that requires access to the catalog's toast table, and we'll typically do AcceptInvalidationMessages while opening the toast table. So it's possible that the catalog tuple is outdated by the time we finish detoasting it. Since no cache entry exists yet, we can't mark the entry stale during AcceptInvalidationMessages, and instead we'll press forward and build an apparently-valid cache entry. The upshot is that we have a race condition whereby an out-of-date entry could be made in a backend's catalog cache, and persist there indefinitely causing indeterminate misbehavior. To fix, use the existing systable_recheck_tuple code to recheck whether the catalog tuple is still up-to-date after we finish detoasting it. If not, loop around and restart the process of searching the catalog and constructing cache entries from the top. The case is rare enough that this shouldn't create any meaningful performance penalty, even in the SearchCatCacheList case where we need to tear down and reconstruct the whole list. Indeed, the case is so rare that AFAICT it doesn't occur during our regression tests, and there doesn't seem to be any easy way to build a test that would exercise it reliably. To allow testing of the retry code paths, add logic (in USE_ASSERT_CHECKING builds only) that randomly pretends that the recheck failed about one time out of a thousand. This is enough to ensure that we'll pass through the retry paths during most regression test runs. By adding an extra level of looping, this commit creates a need to reindent most of SearchCatCacheMiss and SearchCatCacheList. I'll do that separately, to allow putting those changes in .git-blame-ignore-revs. Patch by me; thanks to Alexander Lakhin for having built a test case to prove the bug is real, and to Xiaoran Wang for review. Back-patch to all supported branches. Discussion: https://postgr.es/m/1393953.1698353013@sss.pgh.pa.us Discussion: https://postgr.es/m/CAGjhLkOoBEC9mLsnB42d3CO1vcMx71MLSEuigeABbQ8oRdA6gw@mail.gmail.com --- src/backend/utils/cache/catcache.c | 164 ++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 40 deletions(-) diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index c0591cffe5..0dcd45d2f0 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -24,6 +24,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_type.h" #include "common/hashfn.h" +#include "common/pg_prng.h" #include "miscadmin.h" #include "port/pg_bitutils.h" #ifdef CATCACHE_STATS @@ -90,10 +91,10 @@ static void CatCachePrintStats(int code, Datum arg); static void CatCacheRemoveCTup(CatCache *cache, CatCTup *ct); static void CatCacheRemoveCList(CatCache *cache, CatCList *cl); static void CatalogCacheInitializeCache(CatCache *cache); -static CatCTup *CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, +static CatCTup *CatalogCacheCreateEntry(CatCache *cache, + HeapTuple ntp, SysScanDesc scandesc, Datum *arguments, - uint32 hashValue, Index hashIndex, - bool negative); + uint32 hashValue, Index hashIndex); static void ReleaseCatCacheWithOwner(HeapTuple tuple, ResourceOwner resowner); static void ReleaseCatCacheListWithOwner(CatCList *list, ResourceOwner resowner); @@ -1372,6 +1373,7 @@ SearchCatCacheMiss(CatCache *cache, SysScanDesc scandesc; HeapTuple ntp; CatCTup *ct; + bool stale; Datum arguments[CATCACHE_MAXKEYS]; /* Initialize local parameter array */ @@ -1380,16 +1382,6 @@ SearchCatCacheMiss(CatCache *cache, arguments[2] = v3; arguments[3] = v4; - /* - * Ok, need to make a lookup in the relation, copy the scankey and fill - * out any per-call fields. - */ - memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * nkeys); - cur_skey[0].sk_argument = v1; - cur_skey[1].sk_argument = v2; - cur_skey[2].sk_argument = v3; - cur_skey[3].sk_argument = v4; - /* * Tuple was not found in cache, so we have to try to retrieve it directly * from the relation. If found, we will add it to the cache; if not @@ -1404,9 +1396,28 @@ SearchCatCacheMiss(CatCache *cache, * will eventually age out of the cache, so there's no functional problem. * This case is rare enough that it's not worth expending extra cycles to * detect. + * + * Another case, which we *must* handle, is that the tuple could become + * outdated during CatalogCacheCreateEntry's attempt to detoast it (since + * AcceptInvalidationMessages can run during TOAST table access). We do + * not want to return already-stale catcache entries, so we loop around + * and do the table scan again if that happens. */ relation = table_open(cache->cc_reloid, AccessShareLock); + do + { + /* + * Ok, need to make a lookup in the relation, copy the scankey and + * fill out any per-call fields. (We must re-do this when retrying, + * because systable_beginscan scribbles on the scankey.) + */ + memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * nkeys); + cur_skey[0].sk_argument = v1; + cur_skey[1].sk_argument = v2; + cur_skey[2].sk_argument = v3; + cur_skey[3].sk_argument = v4; + scandesc = systable_beginscan(relation, cache->cc_indexoid, IndexScanOK(cache, cur_skey), @@ -1415,12 +1426,18 @@ SearchCatCacheMiss(CatCache *cache, cur_skey); ct = NULL; + stale = false; while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) { - ct = CatalogCacheCreateEntry(cache, ntp, arguments, - hashValue, hashIndex, - false); + ct = CatalogCacheCreateEntry(cache, ntp, scandesc, NULL, + hashValue, hashIndex); + /* upon failure, we must start the scan over */ + if (ct == NULL) + { + stale = true; + break; + } /* immediately set the refcount to 1 */ ResourceOwnerEnlarge(CurrentResourceOwner); ct->refcount++; @@ -1429,6 +1446,7 @@ SearchCatCacheMiss(CatCache *cache, } systable_endscan(scandesc); + } while (stale); table_close(relation, AccessShareLock); @@ -1447,9 +1465,11 @@ SearchCatCacheMiss(CatCache *cache, if (IsBootstrapProcessingMode()) return NULL; - ct = CatalogCacheCreateEntry(cache, NULL, arguments, - hashValue, hashIndex, - true); + ct = CatalogCacheCreateEntry(cache, NULL, NULL, arguments, + hashValue, hashIndex); + + /* Creating a negative cache entry shouldn't fail */ + Assert(ct != NULL); CACHE_elog(DEBUG2, "SearchCatCache(%s): Contains %d/%d tuples", cache->cc_relname, cache->cc_ntup, CacheHdr->ch_ntup); @@ -1663,7 +1683,8 @@ SearchCatCacheList(CatCache *cache, * We have to bump the member refcounts temporarily to ensure they won't * get dropped from the cache while loading other members. We use a PG_TRY * block to ensure we can undo those refcounts if we get an error before - * we finish constructing the CatCList. + * we finish constructing the CatCList. ctlist must be valid throughout + * the PG_TRY block. */ ctlist = NIL; @@ -1672,19 +1693,23 @@ SearchCatCacheList(CatCache *cache, ScanKeyData cur_skey[CATCACHE_MAXKEYS]; Relation relation; SysScanDesc scandesc; - - /* - * Ok, need to make a lookup in the relation, copy the scankey and - * fill out any per-call fields. - */ - memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * cache->cc_nkeys); - cur_skey[0].sk_argument = v1; - cur_skey[1].sk_argument = v2; - cur_skey[2].sk_argument = v3; - cur_skey[3].sk_argument = v4; + bool stale; relation = table_open(cache->cc_reloid, AccessShareLock); + do + { + /* + * Ok, need to make a lookup in the relation, copy the scankey and + * fill out any per-call fields. (We must re-do this when + * retrying, because systable_beginscan scribbles on the scankey.) + */ + memcpy(cur_skey, cache->cc_skey, sizeof(ScanKeyData) * cache->cc_nkeys); + cur_skey[0].sk_argument = v1; + cur_skey[1].sk_argument = v2; + cur_skey[2].sk_argument = v3; + cur_skey[3].sk_argument = v4; + scandesc = systable_beginscan(relation, cache->cc_indexoid, IndexScanOK(cache, cur_skey), @@ -1695,6 +1720,8 @@ SearchCatCacheList(CatCache *cache, /* The list will be ordered iff we are doing an index scan */ ordered = (scandesc->irel != NULL); + stale = false; + while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) { uint32 hashValue; @@ -1737,9 +1764,32 @@ SearchCatCacheList(CatCache *cache, if (!found) { /* We didn't find a usable entry, so make a new one */ - ct = CatalogCacheCreateEntry(cache, ntp, arguments, - hashValue, hashIndex, - false); + ct = CatalogCacheCreateEntry(cache, ntp, scandesc, NULL, + hashValue, hashIndex); + /* upon failure, we must start the scan over */ + if (ct == NULL) + { + /* + * Release refcounts on any items we already had. We dare + * not try to free them if they're now unreferenced, since + * an error while doing that would result in the PG_CATCH + * below doing extra refcount decrements. Besides, we'll + * likely re-adopt those items in the next iteration, so + * it's not worth complicating matters to try to get rid + * of them. + */ + foreach(ctlist_item, ctlist) + { + ct = (CatCTup *) lfirst(ctlist_item); + Assert(ct->c_list == NULL); + Assert(ct->refcount > 0); + ct->refcount--; + } + /* Reset ctlist in preparation for new try */ + ctlist = NIL; + stale = true; + break; + } } /* Careful here: add entry to ctlist, then bump its refcount */ @@ -1749,6 +1799,7 @@ SearchCatCacheList(CatCache *cache, } systable_endscan(scandesc); + } while (stale); table_close(relation, AccessShareLock); @@ -1865,22 +1916,42 @@ ReleaseCatCacheListWithOwner(CatCList *list, ResourceOwner resowner) * CatalogCacheCreateEntry * Create a new CatCTup entry, copying the given HeapTuple and other * supplied data into it. The new entry initially has refcount 0. + * + * To create a normal cache entry, ntp must be the HeapTuple just fetched + * from scandesc, and "arguments" is not used. To create a negative cache + * entry, pass NULL for ntp and scandesc; then "arguments" is the cache + * keys to use. In either case, hashValue/hashIndex are the hash values + * computed from the cache keys. + * + * Returns NULL if we attempt to detoast the tuple and observe that it + * became stale. (This cannot happen for a negative entry.) Caller must + * retry the tuple lookup in that case. */ static CatCTup * -CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, - uint32 hashValue, Index hashIndex, - bool negative) +CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, SysScanDesc scandesc, + Datum *arguments, + uint32 hashValue, Index hashIndex) { CatCTup *ct; HeapTuple dtp; MemoryContext oldcxt; - /* negative entries have no tuple associated */ if (ntp) { int i; - Assert(!negative); + /* + * The visibility recheck below essentially never fails during our + * regression tests, and there's no easy way to force it to fail for + * testing purposes. To ensure we have test coverage for the retry + * paths in our callers, make debug builds randomly fail about 0.1% of + * the times through this code path, even when there's no toasted + * fields. + */ +#ifdef USE_ASSERT_CHECKING + if (pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 1000)) + return NULL; +#endif /* * If there are any out-of-line toasted fields in the tuple, expand @@ -1890,7 +1961,20 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, * something using a slightly stale catcache entry. */ if (HeapTupleHasExternal(ntp)) + { dtp = toast_flatten_tuple(ntp, cache->cc_tupdesc); + + /* + * The tuple could become stale while we are doing toast table + * access (since AcceptInvalidationMessages can run then), so we + * must recheck its visibility afterwards. + */ + if (!systable_recheck_tuple(scandesc, ntp)) + { + heap_freetuple(dtp); + return NULL; + } + } else dtp = ntp; @@ -1929,7 +2013,7 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, } else { - Assert(negative); + /* Set up keys for a negative cache entry */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); ct = (CatCTup *) palloc(sizeof(CatCTup)); @@ -1951,7 +2035,7 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, ct->c_list = NULL; ct->refcount = 0; /* for the moment */ ct->dead = false; - ct->negative = negative; + ct->negative = (ntp == NULL); ct->hash_value = hashValue; dlist_push_head(&cache->cc_bucket[hashIndex], &ct->cache_elem);