diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index 9980582b77..fca874752f 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
 	return generate_partition_qual(rel, recurse);
 }
 
-/* Turn an array of OIDs with N elements into a list */
-#define OID_ARRAY_TO_LIST(arr, N, list) \
+/*
+ * Append OIDs of rel's partitions to the list 'partoids' and, for each OID,
+ * append the pointer 'rel' to the list 'parents'.
+ */
+#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
 	do\
 	{\
 		int		i;\
-		for (i = 0; i < (N); i++)\
-			(list) = lappend_oid((list), (arr)[i]);\
+		for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
+		{\
+			(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
+			(parents) = lappend((parents), (rel));\
+		}\
 	} while(0)
 
 /*
@@ -944,11 +950,13 @@ PartitionDispatch *
 RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 								 int *num_parted, List **leaf_part_oids)
 {
-	PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
 	PartitionDispatchData **pd;
 	List	   *all_parts = NIL,
-			   *parted_rels;
-	ListCell   *lc;
+			   *all_parents = NIL,
+			   *parted_rels,
+			   *parted_rel_parents;
+	ListCell   *lc1,
+			   *lc2;
 	int			i,
 				k,
 				offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 	 */
 	*num_parted = 1;
 	parted_rels = list_make1(rel);
-	OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
-	foreach(lc, all_parts)
+	/* The root partitioned table has no parent, so use NULL for its parent */
+	parted_rel_parents = list_make1(NULL);
+	APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
+	forboth(lc1, all_parts, lc2, all_parents)
 	{
-		Relation	partrel = heap_open(lfirst_oid(lc), lockmode);
+		Relation	partrel = heap_open(lfirst_oid(lc1), lockmode);
+		Relation	parent = lfirst(lc2);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 
 		/*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		{
 			(*num_parted)++;
 			parted_rels = lappend(parted_rels, partrel);
-			OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
+			parted_rel_parents = lappend(parted_rel_parents, parent);
+			APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
 		}
 		else
 			heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 									sizeof(PartitionDispatchData *));
 	*leaf_part_oids = NIL;
 	i = k = offset = 0;
-	foreach(lc, parted_rels)
+	forboth(lc1, parted_rels, lc2, parted_rel_parents)
 	{
-		Relation	partrel = lfirst(lc);
+		Relation	partrel = lfirst(lc1);
+		Relation	parent = lfirst(lc2);
 		PartitionKey partkey = RelationGetPartitionKey(partrel);
+		TupleDesc	tupdesc = RelationGetDescr(partrel);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 		int			j,
 					m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		pd[i]->key = partkey;
 		pd[i]->keystate = NIL;
 		pd[i]->partdesc = partdesc;
+		if (parent != NULL)
+		{
+			/*
+			 * For every partitioned table other than the root, store a
+			 * standalone tuple table slot initialized with its tuple
+			 * descriptor, plus a tuple conversion map to convert a tuple
+			 * from its parent's rowtype to its own.  That ensures we look
+			 * at the tuple through the correct tuple descriptor when
+			 * computing its partition key for tuple routing.
+			 */
+			pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
+			pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
+												   tupdesc,
+								 gettext_noop("could not convert row type"));
+		}
+		else
+		{
+			/* Not required for the root partitioned table */
+			pd[i]->tupslot = NULL;
+			pd[i]->tupmap = NULL;
+		}
 		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
 
 		/*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
 	{
 		PartitionKey key = parent->key;
 		PartitionDesc partdesc = parent->partdesc;
+		TupleTableSlot *myslot = parent->tupslot;
+		TupleConversionMap *map = parent->tupmap;
 
 		/* Quick exit */
 		if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
 			return -1;
 		}
 
+		/* Convert the tuple into this parent's rowtype, if needed */
+		if (myslot != NULL && map != NULL)
+		{
+			HeapTuple	tuple = ExecFetchSlotTuple(slot);
+
+			ExecClearTuple(myslot);
+			tuple = do_convert_tuple(tuple, map);
+			ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
+			slot = myslot;
+		}
+
 		/* Extract partition key from tuple */
 		FormPartitionKeyDatum(parent, slot, estate, values, isnull);
 
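The tupslot/tupmap pair added to each PartitionDispatchData entry above leans entirely on the pre-existing tupconvert.c machinery: build an attribute-number map between the parent's and the child's row types once, then push every routed tuple through it and park the result in a standalone slot that uses the child's tuple descriptor. For readers who have not used that API, the sketch below shows the same pattern in isolation; it is illustrative only, and the helper name convert_and_store_tuple() and its arguments are not part of the patch.

#include "postgres.h"

#include "access/htup_details.h"
#include "access/tupconvert.h"
#include "executor/executor.h"

/*
 * Sketch only: convert 'tuple', laid out per 'parentdesc', into the rowtype
 * described by 'childdesc' and store it in a freshly made standalone slot.
 * The caller must eventually call ExecDropSingleTupleTableSlot() on the
 * returned slot, just as the cleanup loops in copy.c and nodeModifyTable.c
 * do for the slots built by RelationGetPartitionDispatchInfo().
 */
static TupleTableSlot *
convert_and_store_tuple(HeapTuple tuple, TupleDesc parentdesc,
						TupleDesc childdesc)
{
	/* A NULL map means the two descriptors are already compatible */
	TupleConversionMap *map = convert_tuples_by_name(parentdesc, childdesc,
								 gettext_noop("could not convert row type"));
	TupleTableSlot *slot = MakeSingleTupleTableSlot(childdesc);
	bool		shouldfree = false;

	if (map != NULL)
	{
		/* Rearrange the tuple's attributes to match childdesc */
		tuple = do_convert_tuple(tuple, map);
		free_conversion_map(map);
		shouldfree = true;		/* the slot owns the converted copy */
	}

	ExecStoreTuple(tuple, slot, InvalidBuffer, shouldfree);
	return slot;
}

RelationGetPartitionDispatchInfo() amortizes this by building the map and the slot once per parent and reusing them for every routed tuple, which is why the slots are dropped only in the cleanup paths touched further down.
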
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index d5901651db..aa25a23336 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
 	/* Triggers might need a slot as well */
 	estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
 
+	/*
+	 * Initialize a dedicated slot to manipulate tuples of any given
+	 * partition's rowtype.
+	 */
+	if (cstate->partition_dispatch_info)
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
+	else
+		estate->es_partition_tuple_slot = NULL;
+
 	/*
 	 * It's more efficient to prepare a bunch of tuples for insertion, and
 	 * insert them in one heap_multi_insert() call, than call heap_insert()
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
 
 	for (;;)
 	{
-		TupleTableSlot *slot;
+		TupleTableSlot *slot,
+				   *oldslot = NULL;
 		bool		skip_tuple;
 		Oid			loaded_oid = InvalidOid;
 
@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
 			map = cstate->partition_tupconv_maps[leaf_part_index];
 			if (map)
 			{
+				Relation	partrel = resultRelInfo->ri_RelationDesc;
+
 				tuple = do_convert_tuple(tuple, map);
+
+				/*
+				 * We must use the partition's tuple descriptor from this
+				 * point on, so switch to the dedicated partition slot until
+				 * we're finished dealing with the partition.
+				 */
+				oldslot = slot;
+				slot = estate->es_partition_tuple_slot;
+				Assert(slot != NULL);
+				ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 				ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 			}
 
@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
 			{
 				resultRelInfo = saved_resultRelInfo;
 				estate->es_result_relation_info = resultRelInfo;
+
+				/* Switch back to the root table's slot, if we had switched */
+				if (oldslot != NULL)
+					slot = oldslot;
 			}
 		}
 	}
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
 	 * Remember cstate->partition_dispatch_info[0] corresponds to the root
 	 * partitioned table, which we must not try to close, because it is
 	 * the main target table of COPY that will be closed eventually by
-	 * DoCopy().
+	 * DoCopy().  Also, tupslot is NULL for the root partitioned table.
 	 */
 	for (i = 1; i < cstate->num_dispatch; i++)
 	{
 		PartitionDispatch pd = cstate->partition_dispatch_info[i];
 
 		heap_close(pd->reldesc, NoLock);
+		ExecDropSingleTupleTableSlot(pd->tupslot);
 	}
 	for (i = 0; i < cstate->num_partitions; i++)
 	{
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a9546106ce..0d85b151c2 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
 	Relation	resultRelationDesc;
 	Oid			newId;
 	List	   *recheckIndexes = NIL;
+	TupleTableSlot *oldslot = NULL;
 
 	/*
 	 * get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
 		map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
 		if (map)
 		{
+			Relation	partrel = resultRelInfo->ri_RelationDesc;
+
 			tuple = do_convert_tuple(tuple, map);
+
+			/*
+			 * We must use the partition's tuple descriptor from this
+			 * point on, so switch to the dedicated partition slot until
+			 * we're finished dealing with the partition.
+			 */
+			oldslot = slot;
+			slot = estate->es_partition_tuple_slot;
+			Assert(slot != NULL);
+			ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 			ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 		}
 	}
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
 	{
 		resultRelInfo = saved_resultRelInfo;
 		estate->es_result_relation_info = resultRelInfo;
+
+		/* Switch back to the root table's slot, if we had switched */
+		if (oldslot != NULL)
+			slot = oldslot;
 	}
 
 	/*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		mtstate->mt_partitions = partitions;
 		mtstate->mt_num_partitions = num_partitions;
 		mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+
+		/*
+		 * Initialize a dedicated slot to manipulate tuples of any given
+		 * partition's rowtype.
+		 */
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
 	}
+	else
+		estate->es_partition_tuple_slot = NULL;
 
 	/*
 	 * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
 	 * Remember node->mt_partition_dispatch_info[0] corresponds to the root
 	 * partitioned table, which we must not try to close, because it is the
 	 * main target table of the query that will be closed by ExecEndPlan().
+	 * Also, tupslot is NULL for the root partitioned table.
 	 */
 	for (i = 1; i < node->mt_num_dispatch; i++)
 	{
 		PartitionDispatch pd = node->mt_partition_dispatch_info[i];
 
 		heap_close(pd->reldesc, NoLock);
+		ExecDropSingleTupleTableSlot(pd->tupslot);
 	}
 	for (i = 0; i < node->mt_num_partitions; i++)
 	{
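CopyFrom() and ExecInsert() share the consumer side of the mechanism: a single extra slot hung off the EState (the es_partition_tuple_slot field this patch adds), whose descriptor is re-pointed at whichever partition the current tuple was just routed to. Because ExecSetSlotDescriptor() clears the slot before installing the new descriptor, one shared slot suffices as long as only one routed tuple is in flight at a time, which is the case at both call sites. Below is a minimal sketch of that idiom using only pre-existing executor APIs; the helper switch_to_partition_slot() is illustrative and does not exist in the tree.

#include "postgres.h"

#include "access/htup_details.h"
#include "executor/executor.h"
#include "utils/rel.h"

/*
 * Sketch only: store 'tuple', already converted to partrel's rowtype, into
 * the dedicated partition slot and return that slot.  The caller keeps its
 * original slot around so it can switch back once the routed insert is done.
 */
static TupleTableSlot *
switch_to_partition_slot(EState *estate, Relation partrel, HeapTuple tuple)
{
	TupleTableSlot *partslot = estate->es_partition_tuple_slot;

	Assert(partslot != NULL);

	/* Make the shared slot describe rows of this partition's rowtype */
	ExecSetSlotDescriptor(partslot, RelationGetDescr(partrel));

	/* The slot takes ownership of the converted tuple */
	ExecStoreTuple(tuple, partslot, InvalidBuffer, true);

	return partslot;
}

Allocating the slot up front in CopyFrom() and ExecInitModifyTable(), rather than once per partition, keeps the per-tuple cost down to the descriptor swap and the tuple copy.
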
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 21effbf87b..bf38df5d29 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
  *	key			Partition key information of the table
  *	keystate	Execution state required for expressions in the partition key
  *	partdesc	Partition descriptor of the table
+ *	tupslot		A standalone TupleTableSlot initialized with this table's
+ *				tuple descriptor
+ *	tupmap		TupleConversionMap to convert from the parent's rowtype to
+ *				this table's rowtype (when extracting the partition key of a
+ *				tuple just before routing it through this table)
  *	indexes		Array with partdesc->nparts members (for details on what
  *				individual members represent, see how they are set in
  *				RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
 	PartitionKey key;
 	List	   *keystate;		/* list of ExprState */
 	PartitionDesc partdesc;
+	TupleTableSlot *tupslot;
+	TupleConversionMap *tupmap;
 	int		   *indexes;
 } PartitionDispatchData;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index cc0821bcac..d43ec56a2b 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -384,6 +384,9 @@ typedef struct EState
 	TupleTableSlot *es_trig_oldtup_slot;	/* for TriggerEnabled */
 	TupleTableSlot *es_trig_newtup_slot;	/* for TriggerEnabled */
 
+	/* Slot used to manipulate a tuple after it is routed to a partition */
+	TupleTableSlot *es_partition_tuple_slot;
+
 	/* Parameter info: */
 	ParamListInfo es_param_list_info;	/* values of external params */
 	ParamExecData *es_param_exec_vals;	/* values of internal params */
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index 561cefa3c4..49f667b119 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -300,3 +300,40 @@ drop cascades to table part_null
 drop cascades to table part_ee_ff
 drop cascades to table part_ee_ff1
 drop cascades to table part_ee_ff2
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+  and (attrelid = 'p'::regclass
+       or attrelid = 'p1'::regclass
+       or attrelid = 'p11'::regclass);
+ attrelid | attname | attnum
+----------+---------+--------
+ p        | a       |      1
+ p1       | a       |      2
+ p11      | a       |      4
+(3 rows)
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+ tableoid | a | b
+----------+---+---
+ p11      | 1 | 2
+(1 row)
+
+-- cleanup
+drop table p cascade;
+NOTICE:  drop cascades to 2 other objects
+DETAIL:  drop cascades to table p1
+drop cascades to table p11
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 846bb5897a..08dc068de8 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
 -- cleanup
 drop table range_parted cascade;
 drop table list_parted cascade;
+
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+  and (attrelid = 'p'::regclass
+       or attrelid = 'p1'::regclass
+       or attrelid = 'p11'::regclass);
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+
+-- cleanup
+drop table p cascade;