Commit f1ac27bf authored by Peter Eisentraut's avatar Peter Eisentraut

Add logical replication support to replicate into partitioned tables

Mainly, this adds support code in logical/worker.c for applying
replicated operations whose target is a partitioned table to its
relevant partitions.

Author: Amit Langote <amitlangote09@gmail.com>
Reviewed-by: default avatarRafia Sabih <rafia.pghackers@gmail.com>
Reviewed-by: default avatarPeter Eisentraut <peter.eisentraut@2ndquadrant.com>
Reviewed-by: default avatarPetr Jelinek <petr@2ndquadrant.com>
Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
parent b7ce6de9
......@@ -402,16 +402,19 @@
<listitem>
<para>
Replication is only supported by tables, partitioned or not, although a
given table must either be partitioned on both servers or not partitioned
at all. Also, when replicating between partitioned tables, the actual
replication occurs between leaf partitions, so partitions on the two
servers must match one-to-one.
Replication is only supported by tables, including partitioned tables.
Attempts to replicate other types of relations such as views, materialized
views, or foreign tables, will result in an error.
</para>
</listitem>
<listitem>
<para>
Attempts to replicate other types of relations such as views, materialized
views, or foreign tables, will result in an error.
When replicating between partitioned tables, the actual replication
originates from the leaf partitions on the publisher, so partitions on
the publisher must also exist on the subscriber as valid target tables.
(They could either be leaf partitions themselves, or they could be
further subpartitioned, or they could even be independent tables.)
</para>
</listitem>
</itemizedlist>
......
......@@ -594,17 +594,9 @@ CheckSubscriptionRelkind(char relkind, const char *nspname,
const char *relname)
{
/*
* We currently only support writing to regular tables. However, give a
* more specific error for partitioned and foreign tables.
* Give a more specific error for foreign tables.
*/
if (relkind == RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot use relation \"%s.%s\" as logical replication target",
nspname, relname),
errdetail("\"%s.%s\" is a partitioned table.",
nspname, relname)));
else if (relkind == RELKIND_FOREIGN_TABLE)
if (relkind == RELKIND_FOREIGN_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot use relation \"%s.%s\" as logical replication target",
......@@ -612,7 +604,7 @@ CheckSubscriptionRelkind(char relkind, const char *nspname,
errdetail("\"%s.%s\" is a foreign table.",
nspname, relname)));
if (relkind != RELKIND_RELATION)
if (relkind != RELKIND_RELATION && relkind != RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot use relation \"%s.%s\" as logical replication target",
......
......@@ -35,6 +35,24 @@ static MemoryContext LogicalRepRelMapContext = NULL;
static HTAB *LogicalRepRelMap = NULL;
static HTAB *LogicalRepTypMap = NULL;
/*
* Partition map (LogicalRepPartMap)
*
* When a partitioned table is used as replication target, replicated
* operations are actually performed on its leaf partitions, which requires
* the partitions to also be mapped to the remote relation. Parent's entry
* (LogicalRepRelMapEntry) cannot be used as-is for all partitions, because
* individual partitions may have different attribute numbers, which means
* attribute mappings to remote relation's attributes must be maintained
* separately for each partition.
*/
static MemoryContext LogicalRepPartMapContext = NULL;
static HTAB *LogicalRepPartMap = NULL;
typedef struct LogicalRepPartMapEntry
{
Oid partoid; /* LogicalRepPartMap's key */
LogicalRepRelMapEntry relmapentry;
} LogicalRepPartMapEntry;
/*
* Relcache invalidation callback for our relation map cache.
......@@ -472,3 +490,174 @@ logicalrep_typmap_gettypname(Oid remoteid)
Assert(OidIsValid(entry->remoteid));
return psprintf("%s.%s", entry->nspname, entry->typname);
}
/*
* Partition cache: look up partition LogicalRepRelMapEntry's
*
* Unlike relation map cache, this is keyed by partition OID, not remote
* relation OID, because we only have to use this cache in the case where
* partitions are not directly mapped to any remote relation, such as when
* replication is occurring with one of their ancestors as target.
*/
/*
* Relcache invalidation callback
*/
static void
logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid)
{
LogicalRepRelMapEntry *entry;
/* Just to be sure. */
if (LogicalRepPartMap == NULL)
return;
if (reloid != InvalidOid)
{
HASH_SEQ_STATUS status;
hash_seq_init(&status, LogicalRepPartMap);
/* TODO, use inverse lookup hashtable? */
while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
{
if (entry->localreloid == reloid)
{
entry->localreloid = InvalidOid;
hash_seq_term(&status);
break;
}
}
}
else
{
/* invalidate all cache entries */
HASH_SEQ_STATUS status;
hash_seq_init(&status, LogicalRepPartMap);
while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
entry->localreloid = InvalidOid;
}
}
/*
* Initialize the partition map cache.
*/
static void
logicalrep_partmap_init(void)
{
HASHCTL ctl;
if (!LogicalRepPartMapContext)
LogicalRepPartMapContext =
AllocSetContextCreate(CacheMemoryContext,
"LogicalRepPartMapContext",
ALLOCSET_DEFAULT_SIZES);
/* Initialize the relation hash table. */
MemSet(&ctl, 0, sizeof(ctl));
ctl.keysize = sizeof(Oid); /* partition OID */
ctl.entrysize = sizeof(LogicalRepPartMapEntry);
ctl.hcxt = LogicalRepPartMapContext;
LogicalRepPartMap = hash_create("logicalrep partition map cache", 64, &ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
/* Watch for invalidation events. */
CacheRegisterRelcacheCallback(logicalrep_partmap_invalidate_cb,
(Datum) 0);
}
/*
* logicalrep_partition_open
*
* Returned entry reuses most of the values of the root table's entry, save
* the attribute map, which can be different for the partition.
*
* Note there's no logialrep_partition_close, because the caller closes the
* the component relation.
*/
LogicalRepRelMapEntry *
logicalrep_partition_open(LogicalRepRelMapEntry *root,
Relation partrel, AttrMap *map)
{
LogicalRepRelMapEntry *entry;
LogicalRepPartMapEntry *part_entry;
LogicalRepRelation *remoterel = &root->remoterel;
Oid partOid = RelationGetRelid(partrel);
AttrMap *attrmap = root->attrmap;
bool found;
int i;
MemoryContext oldctx;
if (LogicalRepPartMap == NULL)
logicalrep_partmap_init();
/* Search for existing entry. */
part_entry = (LogicalRepPartMapEntry *) hash_search(LogicalRepPartMap,
(void *) &partOid,
HASH_ENTER, &found);
if (found)
return &part_entry->relmapentry;
memset(part_entry, 0, sizeof(LogicalRepPartMapEntry));
/* Switch to longer-lived context. */
oldctx = MemoryContextSwitchTo(LogicalRepPartMapContext);
part_entry->partoid = partOid;
/* Remote relation is used as-is from the root entry. */
entry = &part_entry->relmapentry;
entry->remoterel.remoteid = remoterel->remoteid;
entry->remoterel.nspname = pstrdup(remoterel->nspname);
entry->remoterel.relname = pstrdup(remoterel->relname);
entry->remoterel.natts = remoterel->natts;
entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *));
entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid));
for (i = 0; i < remoterel->natts; i++)
{
entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]);
entry->remoterel.atttyps[i] = remoterel->atttyps[i];
}
entry->remoterel.replident = remoterel->replident;
entry->remoterel.attkeys = bms_copy(remoterel->attkeys);
entry->localrel = partrel;
entry->localreloid = partOid;
/*
* If the partition's attributes don't match the root relation's, we'll
* need to make a new attrmap which maps partition attribute numbers to
* remoterel's, instead the original which maps root relation's attribute
* numbers to remoterel's.
*
* Note that 'map' which comes from the tuple routing data structure
* contains 1-based attribute numbers (of the parent relation). However,
* the map in 'entry', a logical replication data structure, contains
* 0-based attribute numbers (of the remote relation).
*/
if (map)
{
AttrNumber attno;
entry->attrmap = make_attrmap(map->maplen);
for (attno = 0; attno < entry->attrmap->maplen; attno++)
{
AttrNumber root_attno = map->attnums[attno];
entry->attrmap->attnums[attno] = attrmap->attnums[root_attno - 1];
}
}
else
entry->attrmap = attrmap;
entry->updatable = root->updatable;
/* state and statelsn are left set to 0. */
MemoryContextSwitchTo(oldctx);
return entry;
}
......@@ -762,7 +762,6 @@ copy_table(Relation rel)
/* Map the publisher relation to local one. */
relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock);
Assert(rel == relmapentry->localrel);
Assert(relmapentry->localrel->rd_rel->relkind == RELKIND_RELATION);
/* Start copy on the publisher. */
initStringInfo(&cmd);
......
This diff is collapsed.
......@@ -34,6 +34,8 @@ extern void logicalrep_relmap_update(LogicalRepRelation *remoterel);
extern LogicalRepRelMapEntry *logicalrep_rel_open(LogicalRepRelId remoteid,
LOCKMODE lockmode);
extern LogicalRepRelMapEntry *logicalrep_partition_open(LogicalRepRelMapEntry *root,
Relation partrel, AttrMap *map);
extern void logicalrep_rel_close(LogicalRepRelMapEntry *rel,
LOCKMODE lockmode);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment