Commit 77b6b5e9 authored by Robert Haas

Make RelationGetPartitionDispatchInfo expand depth-first.

With this change, the order of leaf partitions as returned by
RelationGetPartitionDispatchInfo should now be the same as the
order used by expand_inherited_rtentry.  This will make it simpler
for future patches to match up the partition dispatch information
with the planner data structures.  The new code is also, in my
opinion anyway, simpler and easier to understand.

Amit Langote, reviewed by Amit Khandekar.  I also reviewed and
made a few cosmetic revisions.

Discussion: http://postgr.es/m/d98d4761-5071-1762-501e-0e15047c714b@lab.ntt.co.jp
parent 8951c65d
...@@ -147,6 +147,8 @@ static int32 partition_bound_cmp(PartitionKey key, ...@@ -147,6 +147,8 @@ static int32 partition_bound_cmp(PartitionKey key,
static int partition_bound_bsearch(PartitionKey key, static int partition_bound_bsearch(PartitionKey key,
PartitionBoundInfo boundinfo, PartitionBoundInfo boundinfo,
void *probe, bool probe_is_bound, bool *is_equal); void *probe, bool probe_is_bound, bool *is_equal);
static void get_partition_dispatch_recurse(Relation rel, Relation parent,
List **pds, List **leaf_part_oids);
/* /*
* RelationBuildPartitionDesc * RelationBuildPartitionDesc
...@@ -1191,21 +1193,6 @@ get_partition_qual_relid(Oid relid) ...@@ -1191,21 +1193,6 @@ get_partition_qual_relid(Oid relid)
return result; return result;
} }
/*
* Append OIDs of rel's partitions to the list 'partoids' and for each OID,
* append pointer rel to the list 'parents'.
*/
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
do\
{\
int i;\
for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
{\
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
(parents) = lappend((parents), (rel));\
}\
} while(0)
/* /*
* RelationGetPartitionDispatchInfo * RelationGetPartitionDispatchInfo
* Returns information necessary to route tuples down a partition tree * Returns information necessary to route tuples down a partition tree
...@@ -1222,151 +1209,130 @@ PartitionDispatch * ...@@ -1222,151 +1209,130 @@ PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel, RelationGetPartitionDispatchInfo(Relation rel,
int *num_parted, List **leaf_part_oids) int *num_parted, List **leaf_part_oids)
{ {
List *pdlist = NIL;
PartitionDispatchData **pd; PartitionDispatchData **pd;
List *all_parts = NIL, ListCell *lc;
*all_parents = NIL, int i;
*parted_rels,
*parted_rel_parents;
ListCell *lc1,
*lc2;
int i,
k,
offset;
/* Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
* We rely on the relcache to traverse the partition tree to build both
* the leaf partition OIDs list and the array of PartitionDispatch objects
* for the partitioned tables in the tree. That means every partitioned
* table in the tree must be locked, which is fine since we require the
* caller to lock all the partitions anyway.
*
* For every partitioned table in the tree, starting with the root
* partitioned table, add its relcache entry to parted_rels, while also
* queuing its partitions (in the order in which they appear in the
* partition descriptor) to be looked at later in the same loop. This is
* a bit tricky but works because the foreach() macro doesn't fetch the
* next list element until the bottom of the loop.
*/
*num_parted = 1;
parted_rels = list_make1(rel);
/* Root partitioned table has no parent, so NULL for parent */
parted_rel_parents = list_make1(NULL);
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
forboth(lc1, all_parts, lc2, all_parents)
{
Oid partrelid = lfirst_oid(lc1);
Relation parent = lfirst(lc2);
if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) *num_parted = 0;
{ *leaf_part_oids = NIL;
/*
* Already locked by the caller. Note that it is the
* responsibility of the caller to close the below relcache entry,
* once done using the information being collected here (for
* example, in ExecEndModifyTable).
*/
Relation partrel = heap_open(partrelid, NoLock);
(*num_parted)++; get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids);
parted_rels = lappend(parted_rels, partrel); *num_parted = list_length(pdlist);
parted_rel_parents = lappend(parted_rel_parents, parent); pd = (PartitionDispatchData **) palloc(*num_parted *
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); sizeof(PartitionDispatchData *));
} i = 0;
foreach(lc, pdlist)
{
pd[i++] = lfirst(lc);
} }
/* return pd;
* We want to create two arrays - one for leaf partitions and another for }
* partitioned tables (including the root table and internal partitions).
* While we only create the latter here, leaf partition array of suitable /*
* objects (such as, ResultRelInfo) is created by the caller using the * get_partition_dispatch_recurse
* list of OIDs we return. Indexes into these arrays get assigned in a * Recursively expand partition tree rooted at rel
* breadth-first manner, whereby partitions of any given level are placed *
* consecutively in the respective arrays. * As the partition tree is expanded in a depth-first manner, we maintain two
* global lists: of PartitionDispatch objects corresponding to partitioned
* tables in *pds and of the leaf partition OIDs in *leaf_part_oids.
*
* Note that the order of OIDs of leaf partitions in leaf_part_oids matches
* the order in which the planner's expand_partitioned_rtentry() processes
* them. It's not necessarily the case that the offsets match up exactly,
* because constraint exclusion might prune away some partitions on the
* planner side, whereas we'll always have the complete list; but unpruned
* partitions will appear in the same order in the plan as they are returned
* here.
*/ */
pd = (PartitionDispatchData **) palloc(*num_parted * static void
sizeof(PartitionDispatchData *)); get_partition_dispatch_recurse(Relation rel, Relation parent,
*leaf_part_oids = NIL; List **pds, List **leaf_part_oids)
i = k = offset = 0; {
forboth(lc1, parted_rels, lc2, parted_rel_parents) TupleDesc tupdesc = RelationGetDescr(rel);
{ PartitionDesc partdesc = RelationGetPartitionDesc(rel);
Relation partrel = lfirst(lc1); PartitionKey partkey = RelationGetPartitionKey(rel);
Relation parent = lfirst(lc2); PartitionDispatch pd;
PartitionKey partkey = RelationGetPartitionKey(partrel); int i;
TupleDesc tupdesc = RelationGetDescr(partrel);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel); check_stack_depth();
int j,
m; /* Build a PartitionDispatch for this table and add it to *pds. */
pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData));
pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); *pds = lappend(*pds, pd);
pd[i]->reldesc = partrel; pd->reldesc = rel;
pd[i]->key = partkey; pd->key = partkey;
pd[i]->keystate = NIL; pd->keystate = NIL;
pd[i]->partdesc = partdesc; pd->partdesc = partdesc;
if (parent != NULL) if (parent != NULL)
{ {
/* /*
* For every partitioned table other than root, we must store a * For every partitioned table other than the root, we must store a
* tuple table slot initialized with its tuple descriptor and a * tuple table slot initialized with its tuple descriptor and a tuple
* tuple conversion map to convert a tuple from its parent's * conversion map to convert a tuple from its parent's rowtype to its
* rowtype to its own. That is to make sure that we are looking at * own. That is to make sure that we are looking at the correct row
* the correct row using the correct tuple descriptor when * using the correct tuple descriptor when computing its partition key
* computing its partition key for tuple routing. * for tuple routing.
*/ */
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); pd->tupslot = MakeSingleTupleTableSlot(tupdesc);
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
tupdesc, tupdesc,
gettext_noop("could not convert row type")); gettext_noop("could not convert row type"));
} }
else else
{ {
/* Not required for the root partitioned table */ /* Not required for the root partitioned table */
pd[i]->tupslot = NULL; pd->tupslot = NULL;
pd[i]->tupmap = NULL; pd->tupmap = NULL;
} }
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
/* /*
* Indexes corresponding to the internal partitions are multiplied by * Go look at each partition of this table. If it's a leaf partition,
* -1 to distinguish them from those of leaf partitions. Encountering * simply add its OID to *leaf_part_oids. If it's a partitioned table,
* an index >= 0 means we found a leaf partition, which is immediately * recursively call get_partition_dispatch_recurse(), so that its
* returned as the partition we are looking for. A negative index * partitions are processed as well and a corresponding PartitionDispatch
* means we found a partitioned table, whose PartitionDispatch object * object gets added to *pds.
* is located at the above index multiplied back by -1. Using the *
* PartitionDispatch object, search is continued further down the * About the values in pd->indexes: for a leaf partition, it contains the
* partition tree. * leaf partition's position in the global list *leaf_part_oids minus 1,
*/ * whereas for a partitioned table partition, it contains the partition's
m = 0; * position in the global list *pds multiplied by -1. The latter is
for (j = 0; j < partdesc->nparts; j++) * multiplied by -1 to distinguish partitioned tables from leaf partitions
{ * when going through the values in pd->indexes. So, for example, when
Oid partrelid = partdesc->oids[j]; * using it during tuple-routing, encountering a value >= 0 means we found
* a leaf partition. It is immediately returned as the index in the array
* of ResultRelInfos of all the leaf partitions, using which we insert the
* tuple into that leaf partition. A negative value means we found a
* partitioned table. The value multiplied by -1 is returned as the index
* in the array of PartitionDispatch objects of all partitioned tables in
* the tree. This value is used to continue the search in the next level
* of the partition tree.
*/
pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
for (i = 0; i < partdesc->nparts; i++)
{
Oid partrelid = partdesc->oids[i];
if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE)
{ {
*leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid);
pd[i]->indexes[j] = k++; pd->indexes[i] = list_length(*leaf_part_oids) - 1;
} }
else else
{ {
/* /*
* offset denotes the number of partitioned tables of upper * We assume all tables in the partition tree were already locked
* levels including those of the current level. Any partition * by the caller.
* of this table must belong to the next level and hence will
* be placed after the last partitioned table of this level.
*/ */
pd[i]->indexes[j] = -(1 + offset + m); Relation partrel = heap_open(partrelid, NoLock);
m++;
}
}
i++;
/* pd->indexes[i] = -list_length(*pds);
* This counts the number of partitioned tables at upper levels get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids);
* including those of the current level. }
*/
offset += m;
} }
return pd;
} }
/* Module-local functions */ /* Module-local functions */
......
...@@ -1565,6 +1565,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) ...@@ -1565,6 +1565,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti)
root->append_rel_list = list_concat(root->append_rel_list, appinfos); root->append_rel_list = list_concat(root->append_rel_list, appinfos);
} }
/*
* expand_partitioned_rtentry
* Recursively expand an RTE for a partitioned table.
*
* Note that RelationGetPartitionDispatchInfo will expand partitions in the
* same order as this code.
*/
static void static void
expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte,
Index parentRTindex, Relation parentrel, Index parentRTindex, Relation parentrel,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment