Commit 5c156060 authored by Andres Freund's avatar Andres Freund

Fix several recently introduced issues around handling new relation forks.

Most of these stem from d25f5191 "tableam: relation creation, VACUUM
FULL/CLUSTER, SET TABLESPACE.".

1) To pass data to the relation_set_new_filenode()
   RelationSetNewRelfilenode() was made to update RelationData.rd_rel
   directly. That's not OK however, as it makes the relcache entries
   temporarily inconsistent, which among other scenarios is a problem
   if a REINDEX targets an index on pg_class - the
   CatalogTupleUpdate() in RelationSetNewRelfilenode() would then run
   while the relcache entry is inconsistent.  Presumably
   that was introduced because other places in the code do so - while
   those aren't "good practice" they don't appear to be actively
   buggy (e.g. because system tables may not be targeted).

   I (Andres) should have caught this while reviewing and significantly
   evolving the code in that commit, mea culpa.

   Fix that by instead passing in the new RelFileNode as separate
   argument to relation_set_new_filenode() and rely on the relcache to
   update the catalog entry. Also revert that the
   RelationMapUpdateMap() call was changed to immediate, and undo some
   other more unnecessary changes.

2) Document that the relation_set_new_filenode callback cannot rely on
   the whole relcache entry being valid. It might be worthwhile to
   refactor the code to never have to rely on that, but given the way
   heap_create() is currently coded, that'd be a large change.

3) ATExecSetTableSpace() shouldn't do FlushRelationBuffers() itself. A
   table AM might not use shared buffers at all. Move to
   index_copy_data() and heapam_relation_copy_data().

4) heapam_relation_set_new_filenode() previously sometimes accessed
   rel->rd_rel->relpersistence rather than the `persistence`
   argument. Code movement mistake.

5) Previously heapam_relation_set_new_filenode() re-opened the smgr
   relation to create the init fork, if necessary. Instead have
   RelationCreateStorage() return the SMgrRelation and use it to
   create the init fork.

6) Add a note about the danger of modifying the relcache directly to
   ATExecSetTableSpace() - it's currently not a bug because there's a
   check ERRORing for catalog tables.

Regression tests and assertion improvements that together trigger the
bug described in 1) will be added in a later commit, as there is a
related bug on all branches.

Reported-By: Michael Paquier
Diagnosed-By: Tom Lane and Andres Freund
Author: Andres Freund
Reviewed-By: Tom Lane
Discussion: https://postgr.es/m/20190418011430.GA19133@paquier.xyz
parent 9ee7414e
...@@ -566,10 +566,14 @@ heapam_finish_bulk_insert(Relation relation, int options) ...@@ -566,10 +566,14 @@ heapam_finish_bulk_insert(Relation relation, int options)
*/ */
static void static void
heapam_relation_set_new_filenode(Relation rel, char persistence, heapam_relation_set_new_filenode(Relation rel,
const RelFileNode *newrnode,
char persistence,
TransactionId *freezeXid, TransactionId *freezeXid,
MultiXactId *minmulti) MultiXactId *minmulti)
{ {
SMgrRelation srel;
/* /*
* Initialize to the minimum XID that could put tuples in the table. We * Initialize to the minimum XID that could put tuples in the table. We
* know that no xacts older than RecentXmin are still running, so that * know that no xacts older than RecentXmin are still running, so that
...@@ -587,7 +591,7 @@ heapam_relation_set_new_filenode(Relation rel, char persistence, ...@@ -587,7 +591,7 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
*/ */
*minmulti = GetOldestMultiXactId(); *minmulti = GetOldestMultiXactId();
RelationCreateStorage(rel->rd_node, persistence); srel = RelationCreateStorage(*newrnode, persistence);
/* /*
* If required, set up an init fork for an unlogged table so that it can * If required, set up an init fork for an unlogged table so that it can
...@@ -598,16 +602,17 @@ heapam_relation_set_new_filenode(Relation rel, char persistence, ...@@ -598,16 +602,17 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
* while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
* record. Therefore, logging is necessary even if wal_level=minimal. * record. Therefore, logging is necessary even if wal_level=minimal.
*/ */
if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) if (persistence == RELPERSISTENCE_UNLOGGED)
{ {
Assert(rel->rd_rel->relkind == RELKIND_RELATION || Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW || rel->rd_rel->relkind == RELKIND_MATVIEW ||
rel->rd_rel->relkind == RELKIND_TOASTVALUE); rel->rd_rel->relkind == RELKIND_TOASTVALUE);
RelationOpenSmgr(rel); smgrcreate(srel, INIT_FORKNUM, false);
smgrcreate(rel->rd_smgr, INIT_FORKNUM, false); log_smgrcreate(newrnode, INIT_FORKNUM);
log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM); smgrimmedsync(srel, INIT_FORKNUM);
smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
} }
smgrclose(srel);
} }
static void static void
...@@ -617,13 +622,21 @@ heapam_relation_nontransactional_truncate(Relation rel) ...@@ -617,13 +622,21 @@ heapam_relation_nontransactional_truncate(Relation rel)
} }
static void static void
heapam_relation_copy_data(Relation rel, RelFileNode newrnode) heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
{ {
SMgrRelation dstrel; SMgrRelation dstrel;
dstrel = smgropen(newrnode, rel->rd_backend); dstrel = smgropen(*newrnode, rel->rd_backend);
RelationOpenSmgr(rel); RelationOpenSmgr(rel);
/*
* Since we copy the file directly without looking at the shared buffers,
* we'd better first flush out any pages of the source relation that are
* in shared buffers. We assume no new changes will be made while we are
* holding exclusive lock on the rel.
*/
FlushRelationBuffers(rel);
/* /*
* Create and copy all forks of the relation, and schedule unlinking of * Create and copy all forks of the relation, and schedule unlinking of
* old physical files. * old physical files.
...@@ -631,7 +644,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode) ...@@ -631,7 +644,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
* NOTE: any conflict in relfilenode value will be caught in * NOTE: any conflict in relfilenode value will be caught in
* RelationCreateStorage(). * RelationCreateStorage().
*/ */
RelationCreateStorage(newrnode, rel->rd_rel->relpersistence); RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
/* copy main fork */ /* copy main fork */
RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
...@@ -652,7 +665,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode) ...@@ -652,7 +665,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT || if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
forkNum == INIT_FORKNUM)) forkNum == INIT_FORKNUM))
log_smgrcreate(&newrnode, forkNum); log_smgrcreate(newrnode, forkNum);
RelationCopyStorage(rel->rd_smgr, dstrel, forkNum, RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
rel->rd_rel->relpersistence); rel->rd_rel->relpersistence);
} }
......
...@@ -435,7 +435,8 @@ heap_create(const char *relname, ...@@ -435,7 +435,8 @@ heap_create(const char *relname,
case RELKIND_RELATION: case RELKIND_RELATION:
case RELKIND_TOASTVALUE: case RELKIND_TOASTVALUE:
case RELKIND_MATVIEW: case RELKIND_MATVIEW:
table_relation_set_new_filenode(rel, relpersistence, table_relation_set_new_filenode(rel, &rel->rd_node,
relpersistence,
relfrozenxid, relminmxid); relfrozenxid, relminmxid);
break; break;
} }
......
...@@ -75,7 +75,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ ...@@ -75,7 +75,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
* This function is transactional. The creation is WAL-logged, and if the * This function is transactional. The creation is WAL-logged, and if the
* transaction aborts later on, the storage will be destroyed. * transaction aborts later on, the storage will be destroyed.
*/ */
void SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence) RelationCreateStorage(RelFileNode rnode, char relpersistence)
{ {
PendingRelDelete *pending; PendingRelDelete *pending;
...@@ -99,7 +99,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) ...@@ -99,7 +99,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
break; break;
default: default:
elog(ERROR, "invalid relpersistence: %c", relpersistence); elog(ERROR, "invalid relpersistence: %c", relpersistence);
return; /* placate compiler */ return NULL; /* placate compiler */
} }
srel = smgropen(rnode, backend); srel = smgropen(rnode, backend);
...@@ -117,13 +117,15 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) ...@@ -117,13 +117,15 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->nestLevel = GetCurrentTransactionNestLevel(); pending->nestLevel = GetCurrentTransactionNestLevel();
pending->next = pendingDeletes; pending->next = pendingDeletes;
pendingDeletes = pending; pendingDeletes = pending;
return srel;
} }
/* /*
* Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
*/ */
void void
log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{ {
xl_smgr_create xlrec; xl_smgr_create xlrec;
...@@ -294,6 +296,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks) ...@@ -294,6 +296,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
/* /*
* Copy a fork's data, block by block. * Copy a fork's data, block by block.
*
* Note that this requires that there is no dirty data in shared buffers. If
* it's possible that there are, callers need to flush those using
* e.g. FlushRelationBuffers(rel).
*/ */
void void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst, RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
......
...@@ -12236,14 +12236,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) ...@@ -12236,14 +12236,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
elog(ERROR, "cache lookup failed for relation %u", tableOid); elog(ERROR, "cache lookup failed for relation %u", tableOid);
rd_rel = (Form_pg_class) GETSTRUCT(tuple); rd_rel = (Form_pg_class) GETSTRUCT(tuple);
/*
* Since we copy the file directly without looking at the shared buffers,
* we'd better first flush out any pages of the source relation that are
* in shared buffers. We assume no new changes will be made while we are
* holding exclusive lock on the rel.
*/
FlushRelationBuffers(rel);
/* /*
* Relfilenodes are not unique in databases across tablespaces, so we need * Relfilenodes are not unique in databases across tablespaces, so we need
* to allocate a new one in the new tablespace. * to allocate a new one in the new tablespace.
...@@ -12266,10 +12258,16 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) ...@@ -12266,10 +12258,16 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
Assert(rel->rd_rel->relkind == RELKIND_RELATION || Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW || rel->rd_rel->relkind == RELKIND_MATVIEW ||
rel->rd_rel->relkind == RELKIND_TOASTVALUE); rel->rd_rel->relkind == RELKIND_TOASTVALUE);
table_relation_copy_data(rel, newrnode); table_relation_copy_data(rel, &newrnode);
} }
/* update the pg_class row */ /*
* Update the pg_class row.
*
* NB: This wouldn't work if ATExecSetTableSpace() were allowed to be
* executed on pg_class or its indexes (the above copy wouldn't contain
* the updated pg_class entry), but that's forbidden above.
*/
rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace; rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace;
rd_rel->relfilenode = newrelfilenode; rd_rel->relfilenode = newrelfilenode;
CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); CatalogTupleUpdate(pg_class, &tuple->t_self, tuple);
...@@ -12537,6 +12535,14 @@ index_copy_data(Relation rel, RelFileNode newrnode) ...@@ -12537,6 +12535,14 @@ index_copy_data(Relation rel, RelFileNode newrnode)
dstrel = smgropen(newrnode, rel->rd_backend); dstrel = smgropen(newrnode, rel->rd_backend);
RelationOpenSmgr(rel); RelationOpenSmgr(rel);
/*
* Since we copy the file directly without looking at the shared buffers,
* we'd better first flush out any pages of the source relation that are
* in shared buffers. We assume no new changes will be made while we are
* holding exclusive lock on the rel.
*/
FlushRelationBuffers(rel);
/* /*
* Create and copy all forks of the relation, and schedule unlinking of * Create and copy all forks of the relation, and schedule unlinking of
* old physical files. * old physical files.
......
...@@ -3440,6 +3440,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence) ...@@ -3440,6 +3440,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
Form_pg_class classform; Form_pg_class classform;
MultiXactId minmulti = InvalidMultiXactId; MultiXactId minmulti = InvalidMultiXactId;
TransactionId freezeXid = InvalidTransactionId; TransactionId freezeXid = InvalidTransactionId;
RelFileNode newrnode;
/* Allocate a new relfilenode */ /* Allocate a new relfilenode */
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL, newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
...@@ -3462,36 +3463,20 @@ RelationSetNewRelfilenode(Relation relation, char persistence) ...@@ -3462,36 +3463,20 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
*/ */
RelationDropStorage(relation); RelationDropStorage(relation);
/* /* initialize new relfilenode from old relfilenode */
* Now update the pg_class row. However, if we're dealing with a mapped newrnode = relation->rd_node;
* index, pg_class.relfilenode doesn't change; instead we have to send the
* update to the relation mapper.
*/
if (RelationIsMapped(relation))
RelationMapUpdateMap(RelationGetRelid(relation),
newrelfilenode,
relation->rd_rel->relisshared,
true);
else
{
relation->rd_rel->relfilenode = newrelfilenode;
classform->relfilenode = newrelfilenode;
}
RelationInitPhysicalAddr(relation);
/* /*
* Create storage for the main fork of the new relfilenode. If it's * Create storage for the main fork of the new relfilenode. If it's
* table-like object, call into table AM to do so, which'll also create * table-like object, call into table AM to do so, which'll also create
* the table's init fork. * the table's init fork.
* *
* NOTE: any conflict in relfilenode value will be caught here, if * NOTE: If relevant for the AM, any conflict in relfilenode value will be
* GetNewRelFileNode messes up for any reason. * caught here, if GetNewRelFileNode messes up for any reason.
*/ */
newrnode = relation->rd_node;
newrnode.relNode = newrelfilenode;
/*
* Create storage for relation.
*/
switch (relation->rd_rel->relkind) switch (relation->rd_rel->relkind)
{ {
/* shouldn't be called for these */ /* shouldn't be called for these */
...@@ -3505,18 +3490,36 @@ RelationSetNewRelfilenode(Relation relation, char persistence) ...@@ -3505,18 +3490,36 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
case RELKIND_INDEX: case RELKIND_INDEX:
case RELKIND_SEQUENCE: case RELKIND_SEQUENCE:
RelationCreateStorage(relation->rd_node, persistence); {
RelationOpenSmgr(relation); SMgrRelation srel;
srel = RelationCreateStorage(newrnode, persistence);
smgrclose(srel);
}
break; break;
case RELKIND_RELATION: case RELKIND_RELATION:
case RELKIND_TOASTVALUE: case RELKIND_TOASTVALUE:
case RELKIND_MATVIEW: case RELKIND_MATVIEW:
table_relation_set_new_filenode(relation, persistence, table_relation_set_new_filenode(relation, &newrnode,
persistence,
&freezeXid, &minmulti); &freezeXid, &minmulti);
break; break;
} }
/*
* However, if we're dealing with a mapped index, pg_class.relfilenode
* doesn't change; instead we have to send the update to the relation
* mapper.
*/
if (RelationIsMapped(relation))
RelationMapUpdateMap(RelationGetRelid(relation),
newrelfilenode,
relation->rd_rel->relisshared,
false);
else
classform->relfilenode = newrelfilenode;
/* These changes are safe even for a mapped relation */ /* These changes are safe even for a mapped relation */
if (relation->rd_rel->relkind != RELKIND_SEQUENCE) if (relation->rd_rel->relkind != RELKIND_SEQUENCE)
{ {
......
...@@ -416,7 +416,12 @@ typedef struct TableAmRoutine ...@@ -416,7 +416,12 @@ typedef struct TableAmRoutine
* This callback needs to create a new relation filenode for `rel`, with * This callback needs to create a new relation filenode for `rel`, with
* appropriate durability behaviour for `persistence`. * appropriate durability behaviour for `persistence`.
* *
* On output *freezeXid, *minmulti must be set to the values appropriate * Note that only the subset of the relcache filled by
* RelationBuildLocalRelation() can be relied upon and that the relation's
* catalog entries will either not yet exist (new relation), or
* will still reference the old relfilenode.
*
* As output *freezeXid, *minmulti must be set to the values appropriate
* for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
* fields to be filled they can be set to InvalidTransactionId and * fields to be filled they can be set to InvalidTransactionId and
* InvalidMultiXactId, respectively. * InvalidMultiXactId, respectively.
...@@ -424,6 +429,7 @@ typedef struct TableAmRoutine ...@@ -424,6 +429,7 @@ typedef struct TableAmRoutine
* See also table_relation_set_new_filenode(). * See also table_relation_set_new_filenode().
*/ */
void (*relation_set_new_filenode) (Relation rel, void (*relation_set_new_filenode) (Relation rel,
const RelFileNode *newrnode,
char persistence, char persistence,
TransactionId *freezeXid, TransactionId *freezeXid,
MultiXactId *minmulti); MultiXactId *minmulti);
...@@ -444,7 +450,8 @@ typedef struct TableAmRoutine ...@@ -444,7 +450,8 @@ typedef struct TableAmRoutine
* This can typically be implemented by directly copying the underlying * This can typically be implemented by directly copying the underlying
* storage, unless it contains references to the tablespace internally. * storage, unless it contains references to the tablespace internally.
*/ */
void (*relation_copy_data) (Relation rel, RelFileNode newrnode); void (*relation_copy_data) (Relation rel,
const RelFileNode *newrnode);
/* See table_relation_copy_for_cluster() */ /* See table_relation_copy_for_cluster() */
void (*relation_copy_for_cluster) (Relation NewHeap, void (*relation_copy_for_cluster) (Relation NewHeap,
...@@ -1251,21 +1258,25 @@ table_finish_bulk_insert(Relation rel, int options) ...@@ -1251,21 +1258,25 @@ table_finish_bulk_insert(Relation rel, int options)
*/ */
/* /*
* Create a new relation filenode for `rel`, with persistence set to * Create storage for `rel` in `newrnode`, with persistence set to
* `persistence`. * `persistence`.
* *
* This is used both during relation creation and various DDL operations to * This is used both during relation creation and various DDL operations to
* create a new relfilenode that can be filled from scratch. * create a new relfilenode that can be filled from scratch. When creating
* new storage for an existing relfilenode, this should be called before the
* relcache entry has been updated.
* *
* *freezeXid, *minmulti are set to the xid / multixact horizon for the table * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
* that pg_class.{relfrozenxid, relminmxid} have to be set to. * that pg_class.{relfrozenxid, relminmxid} have to be set to.
*/ */
static inline void static inline void
table_relation_set_new_filenode(Relation rel, char persistence, table_relation_set_new_filenode(Relation rel,
const RelFileNode *newrnode,
char persistence,
TransactionId *freezeXid, TransactionId *freezeXid,
MultiXactId *minmulti) MultiXactId *minmulti)
{ {
rel->rd_tableam->relation_set_new_filenode(rel, persistence, rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
freezeXid, minmulti); freezeXid, minmulti);
} }
...@@ -1288,7 +1299,7 @@ table_relation_nontransactional_truncate(Relation rel) ...@@ -1288,7 +1299,7 @@ table_relation_nontransactional_truncate(Relation rel)
* changing a relation's tablespace. * changing a relation's tablespace.
*/ */
static inline void static inline void
table_relation_copy_data(Relation rel, RelFileNode newrnode) table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
{ {
rel->rd_tableam->relation_copy_data(rel, newrnode); rel->rd_tableam->relation_copy_data(rel, newrnode);
} }
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include "storage/smgr.h" #include "storage/smgr.h"
#include "utils/relcache.h" #include "utils/relcache.h"
extern void RelationCreateStorage(RelFileNode rnode, char relpersistence); extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence);
extern void RelationDropStorage(Relation rel); extern void RelationDropStorage(Relation rel);
extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit); extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
extern void RelationTruncate(Relation rel, BlockNumber nblocks); extern void RelationTruncate(Relation rel, BlockNumber nblocks);
......
...@@ -50,7 +50,7 @@ typedef struct xl_smgr_truncate ...@@ -50,7 +50,7 @@ typedef struct xl_smgr_truncate
int flags; int flags;
} xl_smgr_truncate; } xl_smgr_truncate;
extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); extern void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum);
extern void smgr_redo(XLogReaderState *record); extern void smgr_redo(XLogReaderState *record);
extern void smgr_desc(StringInfo buf, XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment