Fix bugs in vacuum of shared rels, by keeping their relcache entries current.

When vacuum processes a relation it uses the corresponding relcache
entry's relfrozenxid / relminmxid as a cutoff for when to remove
tuples etc. Unfortunately for nailed relations (i.e. critical system
catalogs) bugs could frequently lead to the corresponding relcache
entry being stale.

This set of bugs could cause actual data corruption: vacuum would
potentially fail to remove the correct row versions, possibly reviving
them at a later point.  After 699bf7d05c some corruptions in this vein
were prevented, but the additional error checks could also trigger
spuriously. Examples of such errors are:
  ERROR: found xmin ... from before relfrozenxid ...
and
  ERROR: found multixact ... from before relminmxid ...
For these errors to have been caused by this bug, they must have
occurred on system catalog tables.

The two bugs are:

1) Invalidations for nailed relations were ignored, based on the
   theory that the relcache entry for such tables doesn't
   change. Which is largely true, except for fields like relfrozenxid
   etc.  This means that changes to relations vacuumed in other
   sessions weren't picked up by already existing sessions.  Luckily
   autovacuum doesn't have particularly long-running sessions.

2) For shared *and* nailed relations, the shared relcache init file
   was never invalidated while running.  That means that for such
   tables (e.g. pg_authid, pg_database) it's not just already existing
   sessions that are affected, but even new connections are as well.
   That explains why the reports usually were about pg_authid et al.

To fix 1), revalidate the rd_rel portion of a relcache entry when
invalid. This implies a bit of extra complexity to deal with
bootstrapping, but it's not too bad.  The fix for 2) is simpler,
simply always remove both the shared and local init files.

Author: Andres Freund
Reviewed-By: Alvaro Herrera
Discussion:
    https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de
    https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com
    https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com
    https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com
Backpatch: 9.3-
This commit is contained in:
Andres Freund 2018-06-12 11:13:21 -07:00
parent 256d43128d
commit 6a46aba1cd
3 changed files with 146 additions and 72 deletions

View File

@ -515,10 +515,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
(void) GetCurrentCommandId(true);
/*
* If the relation being invalidated is one of those cached in the local
* relcache init file, mark that we need to zap that file at commit.
* If the relation being invalidated is one of those cached in a relcache
* init file, mark that we need to zap that file at commit. For simplicity
* invalidations for a specific database always invalidate the shared file
* as well.
*/
if (OidIsValid(dbId) && RelationIdIsInInitFile(relId))
if (RelationIdIsInInitFile(relId))
transInvalInfo->RelcacheInitFileInval = true;
}
@ -870,18 +872,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
if (RelcacheInitFileInval)
{
elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
dbid);
/*
* RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
* but we should not use SetDatabasePath during recovery, since it is
* RelationCacheInitFilePreInvalidate, when the invalidation message
* is for a specific database, requires DatabasePath to be set, but we
* should not use SetDatabasePath during recovery, since it is
* intended to be used only once by normal backends. Hence, a quick
* hack: set DatabasePath directly then unset after use.
*/
DatabasePath = GetDatabasePath(dbid, tsid);
elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
DatabasePath);
if (OidIsValid(dbid))
DatabasePath = GetDatabasePath(dbid, tsid);
RelationCacheInitFilePreInvalidate();
pfree(DatabasePath);
DatabasePath = NULL;
if (OidIsValid(dbid))
{
pfree(DatabasePath);
DatabasePath = NULL;
}
}
SendSharedInvalidMessages(msgs, nmsgs);

View File

@ -241,6 +241,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
static void RelationClearRelation(Relation relation, bool rebuild);
static void RelationReloadIndexInfo(Relation relation);
static void RelationReloadNailed(Relation relation);
static void RelationFlushRelation(Relation relation);
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
static void AtEOXact_cleanup(Relation relation, bool isCommit);
@ -277,7 +278,7 @@ static void IndexSupportInitialize(oidvector *indclass,
static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
StrategyNumber numSupport);
static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
static void unlink_initfile(const char *initfilename);
static void unlink_initfile(const char *initfilename, int elevel);
/*
@ -1782,7 +1783,16 @@ RelationIdGetRelation(Oid relationId)
RelationReloadIndexInfo(rd);
else
RelationClearRelation(rd, true);
Assert(rd->rd_isvalid);
/*
* Normally entries need to be valid here, but before the relcache
* has been initialized, not enough infrastructure exists to
* perform pg_class lookups. The structure of such entries doesn't
* change, but we still want to update the rd_rel entry. So
* rd_isvalid = false is left in place for a later lookup.
*/
Assert(rd->rd_isvalid ||
(rd->rd_isnailed && !criticalRelcachesBuilt));
}
return rd;
}
@ -1985,6 +1995,81 @@ RelationReloadIndexInfo(Relation relation)
relation->rd_isvalid = true;
}
/*
* RelationReloadNailed - reload minimal information for nailed relations.
*
* The structure of a nailed relation can never change (which is good, because
* we rely on knowing their structure to be able to read catalog content). But
* some parts, e.g. pg_class.relfrozenxid, are still important to have
* accurate content for. Therefore those need to be reloaded after the arrival
* of invalidations.
*/
static void
RelationReloadNailed(Relation relation)
{
Assert(relation->rd_isnailed);
/*
* Redo RelationInitPhysicalAddr in case it is a mapped relation whose
* mapping changed.
*/
RelationInitPhysicalAddr(relation);
/* flag as needing to be revalidated */
relation->rd_isvalid = false;
/*
* Can only reread catalog contents if in a transaction. If the relation
* is currently open (not counting the nailed refcount), do so
* immediately. Otherwise we've already marked the entry as possibly
* invalid, and it'll be fixed when next opened.
*/
if (!IsTransactionState() || relation->rd_refcnt <= 1)
return;
if (relation->rd_rel->relkind == RELKIND_INDEX)
{
/*
* If it's a nailed-but-not-mapped index, then we need to re-read the
* pg_class row to see if its relfilenode changed.
*/
RelationReloadIndexInfo(relation);
}
else
{
/*
* Reload a non-index entry. We can't easily do so if relcaches
* aren't yet built, but that's fine because at that stage the
* attributes that need to be current (like relfrozenxid) aren't yet
* accessed. To ensure the entry will later be revalidated, we leave
* it in invalid state, but allow use (cf. RelationIdGetRelation()).
*/
if (criticalRelcachesBuilt)
{
HeapTuple pg_class_tuple;
Form_pg_class relp;
/*
* NB: Mark the entry as valid before starting to scan, to avoid
* self-recursion when re-building pg_class.
*/
relation->rd_isvalid = true;
pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
true, false);
relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
heap_freetuple(pg_class_tuple);
/*
* Again mark as valid, to protect against concurrently arriving
* invalidations.
*/
relation->rd_isvalid = true;
}
}
}
/*
* RelationDestroyRelation
*
@ -2089,26 +2174,12 @@ RelationClearRelation(Relation relation, bool rebuild)
RelationCloseSmgr(relation);
/*
* Never, never ever blow away a nailed-in system relation, because we'd
* be unable to recover. However, we must redo RelationInitPhysicalAddr
* in case it is a mapped relation whose mapping changed.
*
* If it's a nailed-but-not-mapped index, then we need to re-read the
* pg_class row to see if its relfilenode changed. We do that immediately
* if we're inside a valid transaction and the relation is open (not
* counting the nailed refcount). Otherwise just mark the entry as
* possibly invalid, and it'll be fixed when next opened.
* Treat nailed-in system relations separately, they always need to be
* accessible, so we can't blow them away.
*/
if (relation->rd_isnailed)
{
RelationInitPhysicalAddr(relation);
if (relation->rd_rel->relkind == RELKIND_INDEX)
{
relation->rd_isvalid = false; /* needs to be revalidated */
if (relation->rd_refcnt > 1 && IsTransactionState())
RelationReloadIndexInfo(relation);
}
RelationReloadNailed(relation);
return;
}
@ -5380,24 +5451,26 @@ write_item(const void *data, Size len, FILE *fp)
/*
* Determine whether a given relation (identified by OID) is one of the ones
* we should store in the local relcache init file.
* we should store in a relcache init file.
*
* We must cache all nailed rels, and for efficiency we should cache every rel
* that supports a syscache. The former set is almost but not quite a subset
* of the latter. Currently, we must special-case TriggerRelidNameIndexId,
* which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
* but which does not support any syscache.
*
* Note: this function is currently never called for shared rels. If it were,
* we'd probably also need a special case for DatabaseNameIndexId, which is
* critical but does not support a syscache.
* of the latter. The special cases are relations where
* RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
* which do not support any syscache.
*/
bool
RelationIdIsInInitFile(Oid relationId)
{
if (relationId == TriggerRelidNameIndexId)
if (relationId == SharedSecLabelRelationId ||
relationId == TriggerRelidNameIndexId ||
relationId == DatabaseNameIndexId ||
relationId == SharedSecLabelObjectIndexId)
{
/* If this Assert fails, we don't need this special case anymore. */
/*
* If this Assert fails, we don't need the applicable special case
* anymore.
*/
Assert(!RelationSupportsSysCache(relationId));
return true;
}
@ -5471,38 +5544,30 @@ RelationHasUnloggedIndex(Relation rel)
* We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
* then release the lock in RelationCacheInitFilePostInvalidate. Caller must
* send any pending SI messages between those calls.
*
* Notice this deals only with the local init file, not the shared init file.
* The reason is that there can never be a "significant" change to the
* relcache entry of a shared relation; the most that could happen is
* updates of noncritical fields such as relpages/reltuples. So, while
* it's worth updating the shared init file from time to time, it can never
* be invalid enough to make it necessary to remove it.
*/
void
RelationCacheInitFilePreInvalidate(void)
{
char initfilename[MAXPGPATH];
char localinitfname[MAXPGPATH];
char sharedinitfname[MAXPGPATH];
snprintf(initfilename, sizeof(initfilename), "%s/%s",
DatabasePath, RELCACHE_INIT_FILENAME);
if (DatabasePath)
snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
DatabasePath, RELCACHE_INIT_FILENAME);
snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
RELCACHE_INIT_FILENAME);
LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
if (unlink(initfilename) < 0)
{
/*
* The file might not be there if no backend has been started since
* the last removal. But complain about failures other than ENOENT.
* Fortunately, it's not too late to abort the transaction if we can't
* get rid of the would-be-obsolete init file.
*/
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not remove cache file \"%s\": %m",
initfilename)));
}
/*
* The files might not be there if no backend has been started since the
* last removal. But complain about failures other than ENOENT with
* ERROR. Fortunately, it's not too late to abort the transaction if we
* can't get rid of the would-be-obsolete init file.
*/
if (DatabasePath)
unlink_initfile(localinitfname, ERROR);
unlink_initfile(sharedinitfname, ERROR);
}
void
@ -5528,13 +5593,9 @@ RelationCacheInitFileRemove(void)
struct dirent *de;
char path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
/*
* We zap the shared cache file too. In theory it can't get out of sync
* enough to be a problem, but in data-corruption cases, who knows ...
*/
snprintf(path, sizeof(path), "global/%s",
RELCACHE_INIT_FILENAME);
unlink_initfile(path);
unlink_initfile(path, LOG);
/* Scan everything in the default tablespace */
RelationCacheInitFileRemoveInDir("base");
@ -5586,7 +5647,7 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
/* Try to remove the init file in each database */
snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
unlink_initfile(initfilename);
unlink_initfile(initfilename, LOG);
}
}
@ -5594,12 +5655,15 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
}
static void
unlink_initfile(const char *initfilename)
unlink_initfile(const char *initfilename, int elevel)
{
if (unlink(initfilename) < 0)
{
/* It might not be there, but log any error other than ENOENT */
if (errno != ENOENT)
elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
ereport(elevel,
(errcode_for_file_access(),
errmsg("could not remove cache file \"%s\": %m",
initfilename)));
}
}

View File

@ -64,7 +64,7 @@ typedef struct xl_invalidations
{
Oid dbId; /* MyDatabaseId */
Oid tsId; /* MyDatabaseTableSpace */
bool relcacheInitFileInval; /* invalidate relcache init file */
bool relcacheInitFileInval; /* invalidate relcache init files */
int nmsgs; /* number of shared inval msgs */
SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
} xl_invalidations;