diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 57519fe8d6..2436692eb8 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, false, NULL, NULL, NIL, NULL, transition_capture); } +/* + * Execute BEFORE ROW DELETE triggers. + * + * True indicates caller can proceed with the delete. False indicates caller + * need to suppress the delete and additionally if requested, we need to pass + * back the concurrently updated tuple if any. + */ bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple) + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; bool result = true; @@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, LockTupleExclusive, &newSlot); if (trigtuple == NULL) return false; + + /* + * If the tuple was concurrently updated and the caller of this + * function requested for the updated tuple, skip the trigger + * execution. + */ + if (newSlot != NULL && epqslot != NULL) + { + *epqslot = newSlot; + heap_freetuple(trigtuple); + return false; + } } else trigtuple = fdw_trigtuple; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index fad6df0aeb..9a7dedf5aa 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate, { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, &searchslot->tts_tuple->t_self, - NULL); + NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7e0b867971..779b3d4894 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -609,7 +609,11 @@ ExecInsert(ModifyTableState *mtstate, * foreign table, tupleid is invalid; the FDW has to figure out * which row to delete using data from the planSlot. oldtuple is * passed to foreign table triggers; it is NULL when the foreign - * table has no relevant triggers. + * table has no relevant triggers. We use tupleDeleted to indicate + * whether the tuple is actually deleted, callers can use it to + * decide whether to continue the operation. When this DELETE is a + * part of an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. * * Returns RETURNING result if any, otherwise NULL. * ---------------------------------------------------------------- @@ -621,10 +625,11 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, - bool *tupleDeleted, bool processReturning, bool canSetTag, - bool changingPart) + bool changingPart, + bool *tupleDeleted, + TupleTableSlot **epqslot) { ResultRelInfo *resultRelInfo; Relation resultRelationDesc; @@ -649,7 +654,7 @@ ExecDelete(ModifyTableState *mtstate, bool dodelete; dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tupleid, oldtuple); + tupleid, oldtuple, epqslot); if (!dodelete) /* "do nothing" */ return NULL; @@ -769,19 +774,30 @@ ldelete:; if (!ItemPointerEquals(tupleid, &hufd.ctid)) { - TupleTableSlot *epqslot; + TupleTableSlot *my_epqslot; - epqslot = EvalPlanQual(estate, - epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - LockTupleExclusive, - &hufd.ctid, - hufd.xmax); - if (!TupIsNull(epqslot)) + my_epqslot = EvalPlanQual(estate, + epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + LockTupleExclusive, + &hufd.ctid, + hufd.xmax); + if (!TupIsNull(my_epqslot)) { *tupleid = hufd.ctid; - goto ldelete; + + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqslot) + { + *epqslot = my_epqslot; + return NULL; + } + else + goto ldelete; } } /* tuple already deleted; nothing to do */ @@ -1052,6 +1068,7 @@ lreplace:; { bool tuple_deleted; TupleTableSlot *ret_slot; + TupleTableSlot *epqslot = NULL; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; int map_index; TupleConversionMap *tupconv_map; @@ -1081,8 +1098,8 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, - estate, &tuple_deleted, false, - false /* canSetTag */ , true /* changingPart */ ); + estate, false, false /* canSetTag */ , + true /* changingPart */ , &tuple_deleted, &epqslot); /* * For some reason if DELETE didn't happen (e.g. trigger prevented @@ -1105,7 +1122,23 @@ lreplace:; * resurrect it. */ if (!tuple_deleted) - return NULL; + { + /* + * epqslot will be typically NULL. But when ExecDelete() + * finds that another transaction has concurrently updated the + * same row, it re-fetches the row, skips the delete, and + * epqslot is set to the re-fetched tuple slot. In that case, + * we need to do all the checks again. + */ + if (TupIsNull(epqslot)) + return NULL; + else + { + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); + tuple = ExecMaterializeSlot(slot); + goto lreplace; + } + } /* * Updates set the transition capture map only when a new subplan @@ -2136,8 +2169,8 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag, - false /* changingPart */ ); + true, node->canSetTag, + false /* changingPart */ , NULL, NULL); break; default: elog(ERROR, "unknown operation"); diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index a5b8610fa2..1031448c14 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple); + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out new file mode 100644 index 0000000000..774a7faf6c --- /dev/null +++ b/src/test/isolation/expected/partition-key-update-4.out @@ -0,0 +1,60 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s2b s2u1 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo2 2 ABC update2 update1 + +starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg2 2 ABC update2 update1 +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + +1 ABC update2 trigger + +starting permutation: s1b s2b s2u2 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo1 1 EFG + +starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg1 1 EFG +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 0e997215a8..d5594e80e2 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -74,4 +74,5 @@ test: predicate-gin-nomatch test: partition-key-update-1 test: partition-key-update-2 test: partition-key-update-3 +test: partition-key-update-4 test: plpgsql-toast diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec new file mode 100644 index 0000000000..1d53a7d0c6 --- /dev/null +++ b/src/test/isolation/specs/partition-key-update-4.spec @@ -0,0 +1,76 @@ +# Test that a row that ends up in a new partition contains changes made by +# a concurrent transaction. + +setup +{ + -- + -- Setup to test concurrent handling of ExecDelete(). + -- + CREATE TABLE foo (a int, b text) PARTITION BY LIST(a); + CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2); + INSERT INTO foo VALUES (1, 'ABC'); + + -- + -- Setup to test concurrent handling of GetTupleForTrigger(). + -- + CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a); + CREATE TABLE triglog as select * from footrg; + CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1); + CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2); + INSERT INTO footrg VALUES (1, 'ABC'); + CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$ + BEGIN + OLD.b = OLD.b || ' trigger'; + + -- This will verify that the trigger is not run *before* the row is + -- refetched by EvalPlanQual. The OLD row should contain the changes made + -- by the concurrent session. + INSERT INTO triglog select OLD.*; + + RETURN OLD; + END $$ LANGUAGE PLPGSQL; + CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1 + FOR EACH ROW EXECUTE PROCEDURE func_footrg(); + +} + +teardown +{ + DROP TABLE foo; + DROP TRIGGER footrg_ondel ON footrg1; + DROP FUNCTION func_footrg(); + DROP TABLE footrg; + DROP TABLE triglog; +} + +session "s1" +step "s1b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s1u" { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1ut" { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1s" { SELECT tableoid::regclass, * FROM foo ORDER BY a; } +step "s1st" { SELECT tableoid::regclass, * FROM footrg ORDER BY a; } +step "s1stl" { SELECT * FROM triglog ORDER BY a; } +step "s1c" { COMMIT; } + +session "s2" +step "s2b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s2u1" { UPDATE foo SET b = b || ' update2' WHERE a = 1; } +step "s2u2" { UPDATE foo SET b = 'EFG' WHERE a = 1; } +step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; } +step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; } +step "s2c" { COMMIT; } + + +# Session s1 is moving a row into another partition, but is waiting for +# another session s2 that is updating the original row. The row that ends up +# in the new partition should contain the changes made by session s2. +permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s" + +# Same as above, except, session s1 is waiting in GetTupleTrigger(). +permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl" + +# Below two cases are similar to the above two; except that the session s1 +# fails EvalPlanQual() test, so partition key update does not happen. +permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s" +permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"