Commit a54e1f15 authored by Andres Freund's avatar Andres Freund

Fix bugs in vacuum of shared rels, by keeping their relcache entries current.

When vacuum processes a relation it uses the corresponding relcache
entry's relfrozenxid / relminmxid as a cutoff for when to remove
tuples etc. Unfortunately for nailed relations (i.e. critical system
catalogs) bugs could frequently lead to the corresponding relcache
entry being stale.

This set of bugs could cause actual data corruption as vacuum would
potentially not remove the correct row versions, potentially reviving
them at a later point.  After 699bf7d0 some corruptions in this vein
were prevented, but the additional error checks could also trigger
spuriously. Examples of such errors are:
  ERROR: found xmin ... from before relfrozenxid ...
and
  ERROR: found multixact ... from before relminmxid ...
To be caused by this bug the errors have to occur on system catalog
tables.

The two bugs are:

1) Invalidations for nailed relations were ignored, based on the
   theory that the relcache entry for such tables doesn't
   change. Which is largely true, except for fields like relfrozenxid
   etc.  This means that changes to relations vacuumed in other
   sessions weren't picked up by already existing sessions.  Luckily
   autovacuum doesn't have particularly longrunning sessions.

2) For shared *and* nailed relations, the shared relcache init file
   was never invalidated while running.  That means that for such
   tables (e.g. pg_authid, pg_database) it's not just already existing
   sessions that are affected, but even new connections are as well.
   That explains why the reports usually were about pg_authid et. al.

To fix 1), revalidate the rd_rel portion of a relcache entry when
invalid. This implies a bit of extra complexity to deal with
bootstrapping, but it's not too bad.  The fix for 2) is simpler,
simply always remove both the shared and local init files.

Author: Andres Freund
Reviewed-By: Alvaro Herrera
Discussion:
    https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de
    https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com
    https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com
    https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com
Backpatch: 9.3-
parent 8a07ebb3
...@@ -521,12 +521,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId) ...@@ -521,12 +521,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
(void) GetCurrentCommandId(true); (void) GetCurrentCommandId(true);
/* /*
* If the relation being invalidated is one of those cached in the local * If the relation being invalidated is one of those cached in a relcache
* relcache init file, mark that we need to zap that file at commit. Same * init file, mark that we need to zap that file at commit. For simplicity
* is true when we are invalidating whole relcache. * invalidations for a specific database always invalidate the shared file
* as well. Also zap when we are invalidating whole relcache.
*/ */
if (OidIsValid(dbId) && if (relId == InvalidOid || RelationIdIsInInitFile(relId))
(RelationIdIsInInitFile(relId) || relId == InvalidOid))
transInvalInfo->RelcacheInitFileInval = true; transInvalInfo->RelcacheInitFileInval = true;
} }
...@@ -881,19 +881,27 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, ...@@ -881,19 +881,27 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
if (RelcacheInitFileInval) if (RelcacheInitFileInval)
{ {
elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
dbid);
/* /*
* RelationCacheInitFilePreInvalidate requires DatabasePath to be set, * RelationCacheInitFilePreInvalidate, when the invalidation message
* but we should not use SetDatabasePath during recovery, since it is * is for a specific database, requires DatabasePath to be set, but we
* should not use SetDatabasePath during recovery, since it is
* intended to be used only once by normal backends. Hence, a quick * intended to be used only once by normal backends. Hence, a quick
* hack: set DatabasePath directly then unset after use. * hack: set DatabasePath directly then unset after use.
*/ */
if (OidIsValid(dbid))
DatabasePath = GetDatabasePath(dbid, tsid); DatabasePath = GetDatabasePath(dbid, tsid);
elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
DatabasePath);
RelationCacheInitFilePreInvalidate(); RelationCacheInitFilePreInvalidate();
if (OidIsValid(dbid))
{
pfree(DatabasePath); pfree(DatabasePath);
DatabasePath = NULL; DatabasePath = NULL;
} }
}
SendSharedInvalidMessages(msgs, nmsgs); SendSharedInvalidMessages(msgs, nmsgs);
......
...@@ -250,6 +250,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc); ...@@ -250,6 +250,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
static void RelationClearRelation(Relation relation, bool rebuild); static void RelationClearRelation(Relation relation, bool rebuild);
static void RelationReloadIndexInfo(Relation relation); static void RelationReloadIndexInfo(Relation relation);
static void RelationReloadNailed(Relation relation);
static void RelationFlushRelation(Relation relation); static void RelationFlushRelation(Relation relation);
static void RememberToFreeTupleDescAtEOX(TupleDesc td); static void RememberToFreeTupleDescAtEOX(TupleDesc td);
static void AtEOXact_cleanup(Relation relation, bool isCommit); static void AtEOXact_cleanup(Relation relation, bool isCommit);
...@@ -286,7 +287,7 @@ static void IndexSupportInitialize(oidvector *indclass, ...@@ -286,7 +287,7 @@ static void IndexSupportInitialize(oidvector *indclass,
static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid, static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
StrategyNumber numSupport); StrategyNumber numSupport);
static void RelationCacheInitFileRemoveInDir(const char *tblspcpath); static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
static void unlink_initfile(const char *initfilename); static void unlink_initfile(const char *initfilename, int elevel);
static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1, static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
PartitionDesc partdesc2); PartitionDesc partdesc2);
...@@ -1931,7 +1932,16 @@ RelationIdGetRelation(Oid relationId) ...@@ -1931,7 +1932,16 @@ RelationIdGetRelation(Oid relationId)
RelationReloadIndexInfo(rd); RelationReloadIndexInfo(rd);
else else
RelationClearRelation(rd, true); RelationClearRelation(rd, true);
Assert(rd->rd_isvalid);
/*
* Normally entries need to be valid here, but before the relcache
* has been initialized, not enough infrastructure exists to
* perform pg_class lookups. The structure of such entries doesn't
* change, but we still want to update the rd_rel entry. So
* rd_isvalid = false is left in place for a later lookup.
*/
Assert(rd->rd_isvalid ||
(rd->rd_isnailed && !criticalRelcachesBuilt));
} }
return rd; return rd;
} }
...@@ -2135,6 +2145,81 @@ RelationReloadIndexInfo(Relation relation) ...@@ -2135,6 +2145,81 @@ RelationReloadIndexInfo(Relation relation)
relation->rd_isvalid = true; relation->rd_isvalid = true;
} }
/*
* RelationReloadNailed - reload minimal information for nailed relations.
*
* The structure of a nailed relation can never change (which is good, because
* we rely on knowing their structure to be able to read catalog content). But
* some parts, e.g. pg_class.relfrozenxid, are still important to have
* accurate content for. Therefore those need to be reloaded after the arrival
* of invalidations.
*/
static void
RelationReloadNailed(Relation relation)
{
Assert(relation->rd_isnailed);
/*
* Redo RelationInitPhysicalAddr in case it is a mapped relation whose
* mapping changed.
*/
RelationInitPhysicalAddr(relation);
/* flag as needing to be revalidated */
relation->rd_isvalid = false;
/*
* Can only reread catalog contents if in a transaction. If the relation
* is currently open (not counting the nailed refcount), do so
* immediately. Otherwise we've already marked the entry as possibly
* invalid, and it'll be fixed when next opened.
*/
if (!IsTransactionState() || relation->rd_refcnt <= 1)
return;
if (relation->rd_rel->relkind == RELKIND_INDEX)
{
/*
* If it's a nailed-but-not-mapped index, then we need to re-read the
* pg_class row to see if its relfilenode changed.
*/
RelationReloadIndexInfo(relation);
}
else
{
/*
* Reload a non-index entry. We can't easily do so if relcaches
* aren't yet built, but that's fine because at that stage the
* attributes that need to be current (like relfrozenxid) aren't yet
* accessed. To ensure the entry will later be revalidated, we leave
* it in invalid state, but allow use (cf. RelationIdGetRelation()).
*/
if (criticalRelcachesBuilt)
{
HeapTuple pg_class_tuple;
Form_pg_class relp;
/*
* NB: Mark the entry as valid before starting to scan, to avoid
* self-recursion when re-building pg_class.
*/
relation->rd_isvalid = true;
pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
true, false);
relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
heap_freetuple(pg_class_tuple);
/*
* Again mark as valid, to protect against concurrently arriving
* invalidations.
*/
relation->rd_isvalid = true;
}
}
}
/* /*
* RelationDestroyRelation * RelationDestroyRelation
* *
...@@ -2250,27 +2335,12 @@ RelationClearRelation(Relation relation, bool rebuild) ...@@ -2250,27 +2335,12 @@ RelationClearRelation(Relation relation, bool rebuild)
RelationCloseSmgr(relation); RelationCloseSmgr(relation);
/* /*
* Never, never ever blow away a nailed-in system relation, because we'd * Treat nailed-in system relations separately, they always need to be
* be unable to recover. However, we must redo RelationInitPhysicalAddr * accessible, so we can't blow them away.
* in case it is a mapped relation whose mapping changed.
*
* If it's a nailed-but-not-mapped index, then we need to re-read the
* pg_class row to see if its relfilenode changed. We do that immediately
* if we're inside a valid transaction and the relation is open (not
* counting the nailed refcount). Otherwise just mark the entry as
* possibly invalid, and it'll be fixed when next opened.
*/ */
if (relation->rd_isnailed) if (relation->rd_isnailed)
{ {
RelationInitPhysicalAddr(relation); RelationReloadNailed(relation);
if (relation->rd_rel->relkind == RELKIND_INDEX ||
relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
{
relation->rd_isvalid = false; /* needs to be revalidated */
if (relation->rd_refcnt > 1 && IsTransactionState())
RelationReloadIndexInfo(relation);
}
return; return;
} }
...@@ -5907,24 +5977,26 @@ write_item(const void *data, Size len, FILE *fp) ...@@ -5907,24 +5977,26 @@ write_item(const void *data, Size len, FILE *fp)
/* /*
* Determine whether a given relation (identified by OID) is one of the ones * Determine whether a given relation (identified by OID) is one of the ones
* we should store in the local relcache init file. * we should store in a relcache init file.
* *
* We must cache all nailed rels, and for efficiency we should cache every rel * We must cache all nailed rels, and for efficiency we should cache every rel
* that supports a syscache. The former set is almost but not quite a subset * that supports a syscache. The former set is almost but not quite a subset
* of the latter. Currently, we must special-case TriggerRelidNameIndexId, * of the latter. The special cases are relations where
* which RelationCacheInitializePhase3 chooses to nail for efficiency reasons, * RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
* but which does not support any syscache. * which do not support any syscache.
*
* Note: this function is currently never called for shared rels. If it were,
* we'd probably also need a special case for DatabaseNameIndexId, which is
* critical but does not support a syscache.
*/ */
bool bool
RelationIdIsInInitFile(Oid relationId) RelationIdIsInInitFile(Oid relationId)
{ {
if (relationId == TriggerRelidNameIndexId) if (relationId == SharedSecLabelRelationId ||
relationId == TriggerRelidNameIndexId ||
relationId == DatabaseNameIndexId ||
relationId == SharedSecLabelObjectIndexId)
{ {
/* If this Assert fails, we don't need this special case anymore. */ /*
* If this Assert fails, we don't need the applicable special case
* anymore.
*/
Assert(!RelationSupportsSysCache(relationId)); Assert(!RelationSupportsSysCache(relationId));
return true; return true;
} }
...@@ -5994,38 +6066,30 @@ RelationHasUnloggedIndex(Relation rel) ...@@ -5994,38 +6066,30 @@ RelationHasUnloggedIndex(Relation rel)
* We take the lock and do the unlink in RelationCacheInitFilePreInvalidate, * We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
* then release the lock in RelationCacheInitFilePostInvalidate. Caller must * then release the lock in RelationCacheInitFilePostInvalidate. Caller must
* send any pending SI messages between those calls. * send any pending SI messages between those calls.
*
* Notice this deals only with the local init file, not the shared init file.
* The reason is that there can never be a "significant" change to the
* relcache entry of a shared relation; the most that could happen is
* updates of noncritical fields such as relpages/reltuples. So, while
* it's worth updating the shared init file from time to time, it can never
* be invalid enough to make it necessary to remove it.
*/ */
void void
RelationCacheInitFilePreInvalidate(void) RelationCacheInitFilePreInvalidate(void)
{ {
char initfilename[MAXPGPATH]; char localinitfname[MAXPGPATH];
char sharedinitfname[MAXPGPATH];
snprintf(initfilename, sizeof(initfilename), "%s/%s", if (DatabasePath)
snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
DatabasePath, RELCACHE_INIT_FILENAME); DatabasePath, RELCACHE_INIT_FILENAME);
snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
RELCACHE_INIT_FILENAME);
LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE); LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
if (unlink(initfilename) < 0)
{
/* /*
* The file might not be there if no backend has been started since * The files might not be there if no backend has been started since the
* the last removal. But complain about failures other than ENOENT. * last removal. But complain about failures other than ENOENT with
* Fortunately, it's not too late to abort the transaction if we can't * ERROR. Fortunately, it's not too late to abort the transaction if we
* get rid of the would-be-obsolete init file. * can't get rid of the would-be-obsolete init file.
*/ */
if (errno != ENOENT) if (DatabasePath)
ereport(ERROR, unlink_initfile(localinitfname, ERROR);
(errcode_for_file_access(), unlink_initfile(sharedinitfname, ERROR);
errmsg("could not remove cache file \"%s\": %m",
initfilename)));
}
} }
void void
...@@ -6051,13 +6115,9 @@ RelationCacheInitFileRemove(void) ...@@ -6051,13 +6115,9 @@ RelationCacheInitFileRemove(void)
struct dirent *de; struct dirent *de;
char path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)]; char path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
/*
* We zap the shared cache file too. In theory it can't get out of sync
* enough to be a problem, but in data-corruption cases, who knows ...
*/
snprintf(path, sizeof(path), "global/%s", snprintf(path, sizeof(path), "global/%s",
RELCACHE_INIT_FILENAME); RELCACHE_INIT_FILENAME);
unlink_initfile(path); unlink_initfile(path, LOG);
/* Scan everything in the default tablespace */ /* Scan everything in the default tablespace */
RelationCacheInitFileRemoveInDir("base"); RelationCacheInitFileRemoveInDir("base");
...@@ -6097,7 +6157,7 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath) ...@@ -6097,7 +6157,7 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
/* Try to remove the init file in each database */ /* Try to remove the init file in each database */
snprintf(initfilename, sizeof(initfilename), "%s/%s/%s", snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
tblspcpath, de->d_name, RELCACHE_INIT_FILENAME); tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
unlink_initfile(initfilename); unlink_initfile(initfilename, LOG);
} }
} }
...@@ -6105,12 +6165,15 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath) ...@@ -6105,12 +6165,15 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
} }
static void static void
unlink_initfile(const char *initfilename) unlink_initfile(const char *initfilename, int elevel)
{ {
if (unlink(initfilename) < 0) if (unlink(initfilename) < 0)
{ {
/* It might not be there, but log any error other than ENOENT */ /* It might not be there, but log any error other than ENOENT */
if (errno != ENOENT) if (errno != ENOENT)
elog(LOG, "could not remove cache file \"%s\": %m", initfilename); ereport(elevel,
(errcode_for_file_access(),
errmsg("could not remove cache file \"%s\": %m",
initfilename)));
} }
} }
...@@ -64,7 +64,7 @@ typedef struct xl_invalidations ...@@ -64,7 +64,7 @@ typedef struct xl_invalidations
{ {
Oid dbId; /* MyDatabaseId */ Oid dbId; /* MyDatabaseId */
Oid tsId; /* MyDatabaseTableSpace */ Oid tsId; /* MyDatabaseTableSpace */
bool relcacheInitFileInval; /* invalidate relcache init file */ bool relcacheInitFileInval; /* invalidate relcache init files */
int nmsgs; /* number of shared inval msgs */ int nmsgs; /* number of shared inval msgs */
SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
} xl_invalidations; } xl_invalidations;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment