Fix tuple routing in cases where tuple descriptors don't match.

The previous coding failed to work correctly when we have a
multi-level partitioned hierarchy where tables at successive levels
have different attribute numbers for the partition key attributes.  To
fix, have each PartitionDispatch object store a standalone
TupleTableSlot initialized with the TupleDesc of the corresponding
partitioned table, along with a TupleConversionMap to map tuples from
its parent's rowtype to its own rowtype.  After tuple routing chooses
a leaf partition, we must use the leaf partition's tuple descriptor,
not the root table's.  To that end, a dedicated TupleTableSlot for
tuple routing is now allocated in EState.

Amit Langote
This commit is contained in:
Robert Haas 2016-12-22 17:31:52 -05:00
parent 12bd7dd317
commit 2ac3ef7a01
7 changed files with 190 additions and 15 deletions

View File

@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
return generate_partition_qual(rel, recurse); return generate_partition_qual(rel, recurse);
} }
/* Turn an array of OIDs with N elements into a list */ /*
#define OID_ARRAY_TO_LIST(arr, N, list) \ * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
* append pointer rel to the list 'parents'.
*/
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
do\ do\
{\ {\
int i;\ int i;\
for (i = 0; i < (N); i++)\ for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
(list) = lappend_oid((list), (arr)[i]);\ {\
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
(parents) = lappend((parents), (rel));\
}\
} while(0) } while(0)
/* /*
@ -944,11 +950,13 @@ PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel, int lockmode, RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
int *num_parted, List **leaf_part_oids) int *num_parted, List **leaf_part_oids)
{ {
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
PartitionDispatchData **pd; PartitionDispatchData **pd;
List *all_parts = NIL, List *all_parts = NIL,
*parted_rels; *all_parents = NIL,
ListCell *lc; *parted_rels,
*parted_rel_parents;
ListCell *lc1,
*lc2;
int i, int i,
k, k,
offset; offset;
@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
*/ */
*num_parted = 1; *num_parted = 1;
parted_rels = list_make1(rel); parted_rels = list_make1(rel);
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts); /* Root partitioned table has no parent, so NULL for parent */
foreach(lc, all_parts) parted_rel_parents = list_make1(NULL);
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
forboth(lc1, all_parts, lc2, all_parents)
{ {
Relation partrel = heap_open(lfirst_oid(lc), lockmode); Relation partrel = heap_open(lfirst_oid(lc1), lockmode);
Relation parent = lfirst(lc2);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel); PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
/* /*
@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
{ {
(*num_parted)++; (*num_parted)++;
parted_rels = lappend(parted_rels, partrel); parted_rels = lappend(parted_rels, partrel);
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts); parted_rel_parents = lappend(parted_rel_parents, parent);
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
} }
else else
heap_close(partrel, NoLock); heap_close(partrel, NoLock);
@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
sizeof(PartitionDispatchData *)); sizeof(PartitionDispatchData *));
*leaf_part_oids = NIL; *leaf_part_oids = NIL;
i = k = offset = 0; i = k = offset = 0;
foreach(lc, parted_rels) forboth(lc1, parted_rels, lc2, parted_rel_parents)
{ {
Relation partrel = lfirst(lc); Relation partrel = lfirst(lc1);
Relation parent = lfirst(lc2);
PartitionKey partkey = RelationGetPartitionKey(partrel); PartitionKey partkey = RelationGetPartitionKey(partrel);
TupleDesc tupdesc = RelationGetDescr(partrel);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel); PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
int j, int j,
m; m;
@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
pd[i]->key = partkey; pd[i]->key = partkey;
pd[i]->keystate = NIL; pd[i]->keystate = NIL;
pd[i]->partdesc = partdesc; pd[i]->partdesc = partdesc;
if (parent != NULL)
{
/*
* For every partitioned table other than root, we must store
* a tuple table slot initialized with its tuple descriptor and
* a tuple conversion map to convert a tuple from its parent's
* rowtype to its own. That is to make sure that we are looking
* at the correct row using the correct tuple descriptor when
* computing its partition key for tuple routing.
*/
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
tupdesc,
gettext_noop("could not convert row type"));
}
else
{
/* Not required for the root partitioned table */
pd[i]->tupslot = NULL;
pd[i]->tupmap = NULL;
}
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
/* /*
@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
{ {
PartitionKey key = parent->key; PartitionKey key = parent->key;
PartitionDesc partdesc = parent->partdesc; PartitionDesc partdesc = parent->partdesc;
TupleTableSlot *myslot = parent->tupslot;
TupleConversionMap *map = parent->tupmap;
/* Quick exit */ /* Quick exit */
if (partdesc->nparts == 0) if (partdesc->nparts == 0)
@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
return -1; return -1;
} }
if (myslot != NULL)
{
HeapTuple tuple = ExecFetchSlotTuple(slot);
ExecClearTuple(myslot);
Assert(map != NULL);
tuple = do_convert_tuple(tuple, map);
ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
slot = myslot;
}
/* Extract partition key from tuple */ /* Extract partition key from tuple */
FormPartitionKeyDatum(parent, slot, estate, values, isnull); FormPartitionKeyDatum(parent, slot, estate, values, isnull);

View File

@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
/* Triggers might need a slot as well */ /* Triggers might need a slot as well */
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate); estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
/*
* Initialize a dedicated slot to manipulate tuples of any given
* partition's rowtype.
*/
if (cstate->partition_dispatch_info)
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
else
estate->es_partition_tuple_slot = NULL;
/* /*
* It's more efficient to prepare a bunch of tuples for insertion, and * It's more efficient to prepare a bunch of tuples for insertion, and
* insert them in one heap_multi_insert() call, than call heap_insert() * insert them in one heap_multi_insert() call, than call heap_insert()
@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
for (;;) for (;;)
{ {
TupleTableSlot *slot; TupleTableSlot *slot,
*oldslot = NULL;
bool skip_tuple; bool skip_tuple;
Oid loaded_oid = InvalidOid; Oid loaded_oid = InvalidOid;
@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
map = cstate->partition_tupconv_maps[leaf_part_index]; map = cstate->partition_tupconv_maps[leaf_part_index];
if (map) if (map)
{ {
Relation partrel = resultRelInfo->ri_RelationDesc;
tuple = do_convert_tuple(tuple, map); tuple = do_convert_tuple(tuple, map);
/*
* We must use the partition's tuple descriptor from this
* point on. Use a dedicated slot from this point on until
* we're finished dealing with the partition.
*/
oldslot = slot;
slot = estate->es_partition_tuple_slot;
Assert(slot != NULL);
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
ExecStoreTuple(tuple, slot, InvalidBuffer, true); ExecStoreTuple(tuple, slot, InvalidBuffer, true);
} }
@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
{ {
resultRelInfo = saved_resultRelInfo; resultRelInfo = saved_resultRelInfo;
estate->es_result_relation_info = resultRelInfo; estate->es_result_relation_info = resultRelInfo;
/* Switch back to the slot corresponding to the root table */
Assert(oldslot != NULL);
slot = oldslot;
} }
} }
} }
@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
* Remember cstate->partition_dispatch_info[0] corresponds to the root * Remember cstate->partition_dispatch_info[0] corresponds to the root
* partitioned table, which we must not try to close, because it is * partitioned table, which we must not try to close, because it is
* the main target table of COPY that will be closed eventually by * the main target table of COPY that will be closed eventually by
* DoCopy(). * DoCopy(). Also, tupslot is NULL for the root partitioned table.
*/ */
for (i = 1; i < cstate->num_dispatch; i++) for (i = 1; i < cstate->num_dispatch; i++)
{ {
PartitionDispatch pd = cstate->partition_dispatch_info[i]; PartitionDispatch pd = cstate->partition_dispatch_info[i];
heap_close(pd->reldesc, NoLock); heap_close(pd->reldesc, NoLock);
ExecDropSingleTupleTableSlot(pd->tupslot);
} }
for (i = 0; i < cstate->num_partitions; i++) for (i = 0; i < cstate->num_partitions; i++)
{ {

View File

@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
Relation resultRelationDesc; Relation resultRelationDesc;
Oid newId; Oid newId;
List *recheckIndexes = NIL; List *recheckIndexes = NIL;
TupleTableSlot *oldslot = NULL;
/* /*
* get the heap tuple out of the tuple table slot, making sure we have a * get the heap tuple out of the tuple table slot, making sure we have a
@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
if (map) if (map)
{ {
Relation partrel = resultRelInfo->ri_RelationDesc;
tuple = do_convert_tuple(tuple, map); tuple = do_convert_tuple(tuple, map);
/*
* We must use the partition's tuple descriptor from this
* point on, until we're finished dealing with the partition.
* Use the dedicated slot for that.
*/
oldslot = slot;
slot = estate->es_partition_tuple_slot;
Assert(slot != NULL);
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
ExecStoreTuple(tuple, slot, InvalidBuffer, true); ExecStoreTuple(tuple, slot, InvalidBuffer, true);
} }
} }
@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
{ {
resultRelInfo = saved_resultRelInfo; resultRelInfo = saved_resultRelInfo;
estate->es_result_relation_info = resultRelInfo; estate->es_result_relation_info = resultRelInfo;
/* Switch back to the slot corresponding to the root table */
Assert(oldslot != NULL);
slot = oldslot;
} }
/* /*
@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->mt_partitions = partitions; mtstate->mt_partitions = partitions;
mtstate->mt_num_partitions = num_partitions; mtstate->mt_num_partitions = num_partitions;
mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
/*
* Initialize a dedicated slot to manipulate tuples of any given
* partition's rowtype.
*/
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
} }
else
estate->es_partition_tuple_slot = NULL;
/* /*
* Initialize any WITH CHECK OPTION constraints if needed. * Initialize any WITH CHECK OPTION constraints if needed.
@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
* Remember node->mt_partition_dispatch_info[0] corresponds to the root * Remember node->mt_partition_dispatch_info[0] corresponds to the root
* partitioned table, which we must not try to close, because it is the * partitioned table, which we must not try to close, because it is the
* main target table of the query that will be closed by ExecEndPlan(). * main target table of the query that will be closed by ExecEndPlan().
* Also, tupslot is NULL for the root partitioned table.
*/ */
for (i = 1; i < node->mt_num_dispatch; i++) for (i = 1; i < node->mt_num_dispatch; i++)
{ {
PartitionDispatch pd = node->mt_partition_dispatch_info[i]; PartitionDispatch pd = node->mt_partition_dispatch_info[i];
heap_close(pd->reldesc, NoLock); heap_close(pd->reldesc, NoLock);
ExecDropSingleTupleTableSlot(pd->tupslot);
} }
for (i = 0; i < node->mt_num_partitions; i++) for (i = 0; i < node->mt_num_partitions; i++)
{ {

View File

@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
* key Partition key information of the table * key Partition key information of the table
* keystate Execution state required for expressions in the partition key * keystate Execution state required for expressions in the partition key
* partdesc Partition descriptor of the table * partdesc Partition descriptor of the table
* tupslot A standalone TupleTableSlot initialized with this table's tuple
* descriptor
* tupmap TupleConversionMap to convert from the parent's rowtype to
* this table's rowtype (when extracting the partition key of a
* tuple just before routing it through this table)
* indexes Array with partdesc->nparts members (for details on what * indexes Array with partdesc->nparts members (for details on what
* individual members represent, see how they are set in * individual members represent, see how they are set in
* RelationGetPartitionDispatchInfo()) * RelationGetPartitionDispatchInfo())
@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
PartitionKey key; PartitionKey key;
List *keystate; /* list of ExprState */ List *keystate; /* list of ExprState */
PartitionDesc partdesc; PartitionDesc partdesc;
TupleTableSlot *tupslot;
TupleConversionMap *tupmap;
int *indexes; int *indexes;
} PartitionDispatchData; } PartitionDispatchData;

View File

@ -384,6 +384,9 @@ typedef struct EState
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */ TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */ TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
/* Slot used to manipulate a tuple after it is routed to a partition */
TupleTableSlot *es_partition_tuple_slot;
/* Parameter info: */ /* Parameter info: */
ParamListInfo es_param_list_info; /* values of external params */ ParamListInfo es_param_list_info; /* values of external params */
ParamExecData *es_param_exec_vals; /* values of internal params */ ParamExecData *es_param_exec_vals; /* values of internal params */

View File

@ -300,3 +300,40 @@ drop cascades to table part_null
drop cascades to table part_ee_ff drop cascades to table part_ee_ff
drop cascades to table part_ee_ff1 drop cascades to table part_ee_ff1
drop cascades to table part_ee_ff2 drop cascades to table part_ee_ff2
-- more tests for certain multi-level partitioning scenarios
create table p (a int, b int) partition by range (a, b);
create table p1 (b int, a int not null) partition by range (b);
create table p11 (like p1);
alter table p11 drop a;
alter table p11 add a int;
alter table p11 drop a;
alter table p11 add a int not null;
-- attnum for key attribute 'a' is different in p, p1, and p11
select attrelid::regclass, attname, attnum
from pg_attribute
where attname = 'a'
and (attrelid = 'p'::regclass
or attrelid = 'p1'::regclass
or attrelid = 'p11'::regclass);
attrelid | attname | attnum
----------+---------+--------
p | a | 1
p1 | a | 2
p11 | a | 4
(3 rows)
alter table p1 attach partition p11 for values from (2) to (5);
alter table p attach partition p1 for values from (1, 2) to (1, 10);
-- check that "(1, 2)" is correctly routed to p11.
insert into p values (1, 2);
select tableoid::regclass, * from p;
tableoid | a | b
----------+---+---
p11 | 1 | 2
(1 row)
-- cleanup
drop table p cascade;
NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table p1
drop cascades to table p11

View File

@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
-- cleanup -- cleanup
drop table range_parted cascade; drop table range_parted cascade;
drop table list_parted cascade; drop table list_parted cascade;
-- more tests for certain multi-level partitioning scenarios
create table p (a int, b int) partition by range (a, b);
create table p1 (b int, a int not null) partition by range (b);
create table p11 (like p1);
alter table p11 drop a;
alter table p11 add a int;
alter table p11 drop a;
alter table p11 add a int not null;
-- attnum for key attribute 'a' is different in p, p1, and p11
select attrelid::regclass, attname, attnum
from pg_attribute
where attname = 'a'
and (attrelid = 'p'::regclass
or attrelid = 'p1'::regclass
or attrelid = 'p11'::regclass);
alter table p1 attach partition p11 for values from (2) to (5);
alter table p attach partition p1 for values from (1, 2) to (1, 10);
-- check that "(1, 2)" is correctly routed to p11.
insert into p values (1, 2);
select tableoid::regclass, * from p;
-- cleanup
drop table p cascade;