Fix tuple routing in cases where tuple descriptors don't match.

The previous coding failed to work correctly when we have a
multi-level partitioned hierarchy where tables at successive levels
have different attribute numbers for the partition key attributes.  To
fix, have each PartitionDispatch object store a standalone
TupleTableSlot initialized with the TupleDesc of the corresponding
partitioned table, along with a TupleConversionMap to map tuples from
the its parent's rowtype to own rowtype.  After tuple routing chooses
a leaf partition, we must use the leaf partition's tuple descriptor,
not the root table's.  To that end, a dedicated TupleTableSlot for
tuple routing is now allocated in EState.

Amit Langote
This commit is contained in:
Robert Haas 2016-12-22 17:31:52 -05:00
parent 12bd7dd317
commit 2ac3ef7a01
7 changed files with 190 additions and 15 deletions

View File

@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
return generate_partition_qual(rel, recurse);
}
/* Turn an array of OIDs with N elements into a list */
#define OID_ARRAY_TO_LIST(arr, N, list) \
/*
* Append OIDs of rel's partitions to the list 'partoids' and for each OID,
* append pointer rel to the list 'parents'.
*/
#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
do\
{\
int i;\
for (i = 0; i < (N); i++)\
(list) = lappend_oid((list), (arr)[i]);\
for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
{\
(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
(parents) = lappend((parents), (rel));\
}\
} while(0)
/*
@ -944,11 +950,13 @@ PartitionDispatch *
RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
int *num_parted, List **leaf_part_oids)
{
PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
PartitionDispatchData **pd;
List *all_parts = NIL,
*parted_rels;
ListCell *lc;
*all_parents = NIL,
*parted_rels,
*parted_rel_parents;
ListCell *lc1,
*lc2;
int i,
k,
offset;
@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
*/
*num_parted = 1;
parted_rels = list_make1(rel);
OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
foreach(lc, all_parts)
/* Root partitioned table has no parent, so NULL for parent */
parted_rel_parents = list_make1(NULL);
APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
forboth(lc1, all_parts, lc2, all_parents)
{
Relation partrel = heap_open(lfirst_oid(lc), lockmode);
Relation partrel = heap_open(lfirst_oid(lc1), lockmode);
Relation parent = lfirst(lc2);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
/*
@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
{
(*num_parted)++;
parted_rels = lappend(parted_rels, partrel);
OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
parted_rel_parents = lappend(parted_rel_parents, parent);
APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
}
else
heap_close(partrel, NoLock);
@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
sizeof(PartitionDispatchData *));
*leaf_part_oids = NIL;
i = k = offset = 0;
foreach(lc, parted_rels)
forboth(lc1, parted_rels, lc2, parted_rel_parents)
{
Relation partrel = lfirst(lc);
Relation partrel = lfirst(lc1);
Relation parent = lfirst(lc2);
PartitionKey partkey = RelationGetPartitionKey(partrel);
TupleDesc tupdesc = RelationGetDescr(partrel);
PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
int j,
m;
@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
pd[i]->key = partkey;
pd[i]->keystate = NIL;
pd[i]->partdesc = partdesc;
if (parent != NULL)
{
/*
* For every partitioned table other than root, we must store
* a tuple table slot initialized with its tuple descriptor and
* a tuple conversion map to convert a tuple from its parent's
* rowtype to its own. That is to make sure that we are looking
* at the correct row using the correct tuple descriptor when
* computing its partition key for tuple routing.
*/
pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
tupdesc,
gettext_noop("could not convert row type"));
}
else
{
/* Not required for the root partitioned table */
pd[i]->tupslot = NULL;
pd[i]->tupmap = NULL;
}
pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
/*
@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
{
PartitionKey key = parent->key;
PartitionDesc partdesc = parent->partdesc;
TupleTableSlot *myslot = parent->tupslot;
TupleConversionMap *map = parent->tupmap;
/* Quick exit */
if (partdesc->nparts == 0)
@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
return -1;
}
if (myslot != NULL)
{
HeapTuple tuple = ExecFetchSlotTuple(slot);
ExecClearTuple(myslot);
Assert(map != NULL);
tuple = do_convert_tuple(tuple, map);
ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
slot = myslot;
}
/* Extract partition key from tuple */
FormPartitionKeyDatum(parent, slot, estate, values, isnull);

View File

@ -2435,6 +2435,15 @@ CopyFrom(CopyState cstate)
/* Triggers might need a slot as well */
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
/*
* Initialize a dedicated slot to manipulate tuples of any given
* partition's rowtype.
*/
if (cstate->partition_dispatch_info)
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
else
estate->es_partition_tuple_slot = NULL;
/*
* It's more efficient to prepare a bunch of tuples for insertion, and
* insert them in one heap_multi_insert() call, than call heap_insert()
@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
for (;;)
{
TupleTableSlot *slot;
TupleTableSlot *slot,
*oldslot = NULL;
bool skip_tuple;
Oid loaded_oid = InvalidOid;
@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
map = cstate->partition_tupconv_maps[leaf_part_index];
if (map)
{
Relation partrel = resultRelInfo->ri_RelationDesc;
tuple = do_convert_tuple(tuple, map);
/*
* We must use the partition's tuple descriptor from this
* point on. Use a dedicated slot from this point on until
* we're finished dealing with the partition.
*/
oldslot = slot;
slot = estate->es_partition_tuple_slot;
Assert(slot != NULL);
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
}
@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
{
resultRelInfo = saved_resultRelInfo;
estate->es_result_relation_info = resultRelInfo;
/* Switch back to the slot corresponding to the root table */
Assert(oldslot != NULL);
slot = oldslot;
}
}
}
@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
* Remember cstate->partition_dispatch_info[0] corresponds to the root
* partitioned table, which we must not try to close, because it is
* the main target table of COPY that will be closed eventually by
* DoCopy().
* DoCopy(). Also, tupslot is NULL for the root partitioned table.
*/
for (i = 1; i < cstate->num_dispatch; i++)
{
PartitionDispatch pd = cstate->partition_dispatch_info[i];
heap_close(pd->reldesc, NoLock);
ExecDropSingleTupleTableSlot(pd->tupslot);
}
for (i = 0; i < cstate->num_partitions; i++)
{

View File

@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
Relation resultRelationDesc;
Oid newId;
List *recheckIndexes = NIL;
TupleTableSlot *oldslot = NULL;
/*
* get the heap tuple out of the tuple table slot, making sure we have a
@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
if (map)
{
Relation partrel = resultRelInfo->ri_RelationDesc;
tuple = do_convert_tuple(tuple, map);
/*
* We must use the partition's tuple descriptor from this
* point on, until we're finished dealing with the partition.
* Use the dedicated slot for that.
*/
oldslot = slot;
slot = estate->es_partition_tuple_slot;
Assert(slot != NULL);
ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
ExecStoreTuple(tuple, slot, InvalidBuffer, true);
}
}
@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
{
resultRelInfo = saved_resultRelInfo;
estate->es_result_relation_info = resultRelInfo;
/* Switch back to the slot corresponding to the root table */
Assert(oldslot != NULL);
slot = oldslot;
}
/*
@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->mt_partitions = partitions;
mtstate->mt_num_partitions = num_partitions;
mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
/*
* Initialize a dedicated slot to manipulate tuples of any given
* partition's rowtype.
*/
estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
}
else
estate->es_partition_tuple_slot = NULL;
/*
* Initialize any WITH CHECK OPTION constraints if needed.
@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
* Remember node->mt_partition_dispatch_info[0] corresponds to the root
* partitioned table, which we must not try to close, because it is the
* main target table of the query that will be closed by ExecEndPlan().
* Also, tupslot is NULL for the root partitioned table.
*/
for (i = 1; i < node->mt_num_dispatch; i++)
{
PartitionDispatch pd = node->mt_partition_dispatch_info[i];
heap_close(pd->reldesc, NoLock);
ExecDropSingleTupleTableSlot(pd->tupslot);
}
for (i = 0; i < node->mt_num_partitions; i++)
{

View File

@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
* key Partition key information of the table
* keystate Execution state required for expressions in the partition key
* partdesc Partition descriptor of the table
* tupslot A standalone TupleTableSlot initialized with this table's tuple
* descriptor
* tupmap TupleConversionMap to convert from the parent's rowtype to
* this table's rowtype (when extracting the partition key of a
* tuple just before routing it through this table)
* indexes Array with partdesc->nparts members (for details on what
* individual members represent, see how they are set in
* RelationGetPartitionDispatchInfo())
@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
PartitionKey key;
List *keystate; /* list of ExprState */
PartitionDesc partdesc;
TupleTableSlot *tupslot;
TupleConversionMap *tupmap;
int *indexes;
} PartitionDispatchData;

View File

@ -384,6 +384,9 @@ typedef struct EState
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
/* Slot used to manipulate a tuple after it is routed to a partition */
TupleTableSlot *es_partition_tuple_slot;
/* Parameter info: */
ParamListInfo es_param_list_info; /* values of external params */
ParamExecData *es_param_exec_vals; /* values of internal params */

View File

@ -300,3 +300,40 @@ drop cascades to table part_null
drop cascades to table part_ee_ff
drop cascades to table part_ee_ff1
drop cascades to table part_ee_ff2
-- more tests for certain multi-level partitioning scenarios
create table p (a int, b int) partition by range (a, b);
create table p1 (b int, a int not null) partition by range (b);
create table p11 (like p1);
alter table p11 drop a;
alter table p11 add a int;
alter table p11 drop a;
alter table p11 add a int not null;
-- attnum for key attribute 'a' is different in p, p1, and p11
select attrelid::regclass, attname, attnum
from pg_attribute
where attname = 'a'
and (attrelid = 'p'::regclass
or attrelid = 'p1'::regclass
or attrelid = 'p11'::regclass);
attrelid | attname | attnum
----------+---------+--------
p | a | 1
p1 | a | 2
p11 | a | 4
(3 rows)
alter table p1 attach partition p11 for values from (2) to (5);
alter table p attach partition p1 for values from (1, 2) to (1, 10);
-- check that "(1, 2)" is correctly routed to p11.
insert into p values (1, 2);
select tableoid::regclass, * from p;
tableoid | a | b
----------+---+---
p11 | 1 | 2
(1 row)
-- cleanup
drop table p cascade;
NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table p1
drop cascades to table p11

View File

@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
-- cleanup
drop table range_parted cascade;
drop table list_parted cascade;
-- more tests for certain multi-level partitioning scenarios
create table p (a int, b int) partition by range (a, b);
create table p1 (b int, a int not null) partition by range (b);
create table p11 (like p1);
alter table p11 drop a;
alter table p11 add a int;
alter table p11 drop a;
alter table p11 add a int not null;
-- attnum for key attribute 'a' is different in p, p1, and p11
select attrelid::regclass, attname, attnum
from pg_attribute
where attname = 'a'
and (attrelid = 'p'::regclass
or attrelid = 'p1'::regclass
or attrelid = 'p11'::regclass);
alter table p1 attach partition p11 for values from (2) to (5);
alter table p attach partition p1 for values from (1, 2) to (1, 10);
-- check that "(1, 2)" is correctly routed to p11.
insert into p values (1, 2);
select tableoid::regclass, * from p;
-- cleanup
drop table p cascade;