diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 79fcbd6b06..bd2ea25804 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -51,6 +51,11 @@ * PartitionDispatchData->indexes for details on how this array is * indexed. * + * nonleaf_partitions + * Array of 'max_dispatch' elements containing pointers to fake + * ResultRelInfo objects for nonleaf partitions, useful for checking + * the partition constraint. + * * num_dispatch * The current number of items stored in the 'partition_dispatch_info' * array. Also serves as the index of the next free array element for @@ -89,6 +94,7 @@ struct PartitionTupleRouting { Relation partition_root; PartitionDispatch *partition_dispatch_info; + ResultRelInfo **nonleaf_partitions; int num_dispatch; int max_dispatch; ResultRelInfo **partitions; @@ -280,9 +286,11 @@ ExecFindPartition(ModifyTableState *mtstate, PartitionDispatch dispatch; PartitionDesc partdesc; ExprContext *ecxt = GetPerTupleExprContext(estate); - TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple; + TupleTableSlot *rootslot = slot; TupleTableSlot *myslot = NULL; MemoryContext oldcxt; + ResultRelInfo *rri = NULL; /* use per-tuple context here to avoid leaking memory */ oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -296,9 +304,8 @@ ExecFindPartition(ModifyTableState *mtstate, /* start with the root partitioned table */ dispatch = pd[0]; - while (true) + while (dispatch != NULL) { - AttrMap *map = dispatch->tupmap; int partidx = -1; CHECK_FOR_INTERRUPTS(); @@ -306,17 +313,6 @@ ExecFindPartition(ModifyTableState *mtstate, rel = dispatch->reldesc; partdesc = dispatch->partdesc; - /* - * Convert the tuple to this parent's layout, if different from the - * current relation. - */ - myslot = dispatch->tupslot; - if (myslot != NULL) - { - Assert(map != NULL); - slot = execute_attr_map_slot(map, slot, myslot); - } - /* * Extract partition key from tuple. Expression evaluation machinery * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to @@ -352,11 +348,9 @@ ExecFindPartition(ModifyTableState *mtstate, if (partdesc->is_leaf[partidx]) { - ResultRelInfo *rri; - /* - * Look to see if we've already got a ResultRelInfo for this - * partition. + * We've reached the leaf -- hurray, we're done. Look to see if + * we've already got a ResultRelInfo for this partition. */ if (likely(dispatch->indexes[partidx] >= 0)) { @@ -400,14 +394,10 @@ ExecFindPartition(ModifyTableState *mtstate, dispatch, rootResultRelInfo, partidx); } + Assert(rri != NULL); - /* Release the tuple in the lowest parent's dedicated slot. */ - if (slot == myslot) - ExecClearTuple(myslot); - - MemoryContextSwitchTo(oldcxt); - ecxt->ecxt_scantuple = ecxt_scantuple_old; - return rri; + /* Signal to terminate the loop */ + dispatch = NULL; } else { @@ -419,6 +409,8 @@ ExecFindPartition(ModifyTableState *mtstate, /* Already built. */ Assert(dispatch->indexes[partidx] < proute->num_dispatch); + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; + /* * Move down to the next partition level and search again * until we find a leaf partition that matches this tuple @@ -440,10 +432,75 @@ ExecFindPartition(ModifyTableState *mtstate, dispatch, partidx); Assert(dispatch->indexes[partidx] >= 0 && dispatch->indexes[partidx] < proute->num_dispatch); + + rri = proute->nonleaf_partitions[dispatch->indexes[partidx]]; dispatch = subdispatch; } + + /* + * Convert the tuple to the new parent's layout, if different from + * the previous parent. + */ + if (dispatch->tupslot) + { + AttrMap *map = dispatch->tupmap; + TupleTableSlot *tempslot = myslot; + + myslot = dispatch->tupslot; + slot = execute_attr_map_slot(map, slot, myslot); + + if (tempslot != NULL) + ExecClearTuple(tempslot); + } + } + + /* + * If this partition is the default one, we must check its partition + * constraint now, which may have changed concurrently due to + * partitions being added to the parent. + * + * (We do this here, and do not rely on ExecInsert doing it, because + * we don't want to miss doing it for non-leaf partitions.) + */ + if (partidx == partdesc->boundinfo->default_index) + { + PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo; + + /* + * The tuple must match the partition's layout for the constraint + * expression to be evaluated successfully. If the partition is + * sub-partitioned, that would already be the case due to the code + * above, but for a leaf partition the tuple still matches the + * parent's layout. + * + * Note that we have a map to convert from root to current + * partition, but not from immediate parent to current partition. + * So if we have to convert, do it from the root slot; if not, use + * the root slot as-is. + */ + if (partrouteinfo) + { + TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap; + + if (map) + slot = execute_attr_map_slot(map->attrMap, rootslot, + partrouteinfo->pi_PartitionTupleSlot); + else + slot = rootslot; + } + + ExecPartitionCheck(rri, slot, estate, true); } } + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (myslot != NULL) + ExecClearTuple(myslot); + /* and restore ecxt's scantuple */ + ecxt->ecxt_scantuple = ecxt_scantuple_saved; + MemoryContextSwitchTo(oldcxt); + + return rri; } /* @@ -1060,6 +1117,8 @@ ExecInitPartitionDispatchInfo(EState *estate, proute->max_dispatch = 4; proute->partition_dispatch_info = (PartitionDispatch *) palloc(sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * proute->max_dispatch); } else { @@ -1067,10 +1126,28 @@ ExecInitPartitionDispatchInfo(EState *estate, proute->partition_dispatch_info = (PartitionDispatch *) repalloc(proute->partition_dispatch_info, sizeof(PartitionDispatch) * proute->max_dispatch); + proute->nonleaf_partitions = (ResultRelInfo **) + repalloc(proute->nonleaf_partitions, + sizeof(ResultRelInfo *) * proute->max_dispatch); } } proute->partition_dispatch_info[dispatchidx] = pd; + /* + * If setting up a PartitionDispatch for a sub-partitioned table, we may + * also need a minimally valid ResultRelInfo for checking the partition + * constraint later; set that up now. + */ + if (parent_pd) + { + ResultRelInfo *rri = makeNode(ResultRelInfo); + + InitResultRelInfo(rri, rel, 1, proute->partition_root, 0); + proute->nonleaf_partitions[dispatchidx] = rri; + } + else + proute->nonleaf_partitions[dispatchidx] = NULL; + /* * Finally, if setting up a PartitionDispatch for a sub-partitioned table, * install a downlink in the parent to allow quick descent. diff --git a/src/test/isolation/expected/partition-concurrent-attach.out b/src/test/isolation/expected/partition-concurrent-attach.out new file mode 100644 index 0000000000..17fac39989 --- /dev/null +++ b/src/test/isolation/expected/partition-concurrent-attach.out @@ -0,0 +1,49 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s1a s2b s2i s1c s2c s2s +step s1b: begin; +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2b: begin; +step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1c: commit; +step s2i: <... completed> +error in steps s1c s2i: ERROR: new row for relation "tpart_default" violates partition constraint +step s2c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_2 110 xxx +tpart_2 120 yyy +tpart_2 150 zzz + +starting permutation: s1b s1a s2b s2i2 s1c s2c s2s +step s1b: begin; +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2b: begin; +step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1c: commit; +step s2i2: <... completed> +error in steps s1c s2i2: ERROR: new row for relation "tpart_default" violates partition constraint +step s2c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_2 110 xxx +tpart_2 120 yyy +tpart_2 150 zzz + +starting permutation: s1b s2b s2i s1a s2c s1c s2s +step s1b: begin; +step s2b: begin; +step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); +step s2c: commit; +step s1a: <... completed> +error in steps s2c s1a: ERROR: updated partition constraint for default partition "tpart_default_default" would be violated by some row +step s1c: commit; +step s2s: select tableoid::regclass, * from tpart; +tableoid i j + +tpart_default_default110 xxx +tpart_default_default120 yyy +tpart_default_default150 zzz diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 218c87b24b..a4144127ad 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -80,6 +80,7 @@ test: vacuum-skip-locked test: predicate-hash test: predicate-gist test: predicate-gin +test: partition-concurrent-attach test: partition-key-update-1 test: partition-key-update-2 test: partition-key-update-3 diff --git a/src/test/isolation/specs/partition-concurrent-attach.spec b/src/test/isolation/specs/partition-concurrent-attach.spec new file mode 100644 index 0000000000..48c3f83e0c --- /dev/null +++ b/src/test/isolation/specs/partition-concurrent-attach.spec @@ -0,0 +1,43 @@ +# Verify that default partition constraint is enforced correctly +# in light of partitions being added concurrently to its parent +setup { + drop table if exists tpart; + create table tpart(i int, j text) partition by range(i); + create table tpart_1(like tpart); + create table tpart_2(like tpart); + create table tpart_default (a int, j text, i int) partition by list (j); + create table tpart_default_default (a int, i int, b int, j text); + alter table tpart_default_default drop b; + alter table tpart_default attach partition tpart_default_default default; + alter table tpart_default drop a; + alter table tpart attach partition tpart_default default; + alter table tpart attach partition tpart_1 for values from(0) to (100); + insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); +} + +session "s1" +step "s1b" { begin; } +step "s1a" { alter table tpart attach partition tpart_2 for values from (100) to (200); } +step "s1c" { commit; } + +session "s2" +step "s2b" { begin; } +step "s2i" { insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); } +step "s2i2" { insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); } +step "s2c" { commit; } +step "s2s" { select tableoid::regclass, * from tpart; } + +teardown { drop table tpart; } + +# insert into tpart by s2 which routes to tpart_default due to not seeing +# concurrently added tpart_2 should fail, because the partition constraint +# of tpart_default would have changed due to tpart_2 having been added +permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s" + +# similar to above, but now insert into sub-partitioned tpart_default +permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s" + +# reverse: now the insert into tpart_default by s2 occurs first followed by +# attach in s1, which should fail when it scans the leaf default partition +# find the violating rows +permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s"