From 0527df732beb8c2d584c7e4ea5aeb05cf92e14a2 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Thu, 12 Jul 2018 12:17:27 +0530 Subject: [PATCH] Allow using the updated tuple while moving it to a different partition. An update that causes the tuple to be moved to a different partition was missing out on re-constructing the to-be-updated tuple, based on the latest tuple in the update chain. Instead, it's simply deleting the latest tuple and inserting a new tuple in the new partition based on the old tuple. Commit 2f17844104 didn't consider this case, so some of the updates were getting lost. In passing, change the argument order for output parameter in ExecDelete and add some commentary about it. Reported-by: Pavan Deolasee Author: Amit Khandekar, with minor changes by me Reviewed-by: Dilip Kumar, Amit Kapila and Alvaro Herrera Backpatch-through: 11 Discussion: https://postgr.es/m/CAJ3gD9fRbEzDqdeDq1jxqZUb47kJn+tQ7=Bcgjc8quqKsDViKQ@mail.gmail.com --- src/backend/commands/trigger.c | 22 +++++- src/backend/executor/execReplication.c | 2 +- src/backend/executor/nodeModifyTable.c | 71 ++++++++++++----- src/include/commands/trigger.h | 3 +- .../expected/partition-key-update-4.out | 60 +++++++++++++++ src/test/isolation/isolation_schedule | 1 + .../specs/partition-key-update-4.spec | 76 +++++++++++++++++++ 7 files changed, 213 insertions(+), 22 deletions(-) create mode 100644 src/test/isolation/expected/partition-key-update-4.out create mode 100644 src/test/isolation/specs/partition-key-update-4.spec diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 57519fe8d6..2436692eb8 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2726,11 +2726,19 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, false, NULL, NULL, NIL, NULL, transition_capture); } +/* + * Execute BEFORE ROW DELETE triggers. + * + * True indicates caller can proceed with the delete. False indicates caller + * need to suppress the delete and additionally if requested, we need to pass + * back the concurrently updated tuple if any. + */ bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple) + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; bool result = true; @@ -2747,6 +2755,18 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, LockTupleExclusive, &newSlot); if (trigtuple == NULL) return false; + + /* + * If the tuple was concurrently updated and the caller of this + * function requested for the updated tuple, skip the trigger + * execution. + */ + if (newSlot != NULL && epqslot != NULL) + { + *epqslot = newSlot; + heap_freetuple(trigtuple); + return false; + } } else trigtuple = fdw_trigtuple; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index fad6df0aeb..9a7dedf5aa 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -531,7 +531,7 @@ ExecSimpleRelationDelete(EState *estate, EPQState *epqstate, { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, &searchslot->tts_tuple->t_self, - NULL); + NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7e0b867971..779b3d4894 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -609,7 +609,11 @@ ExecInsert(ModifyTableState *mtstate, * foreign table, tupleid is invalid; the FDW has to figure out * which row to delete using data from the planSlot. oldtuple is * passed to foreign table triggers; it is NULL when the foreign - * table has no relevant triggers. + * table has no relevant triggers. We use tupleDeleted to indicate + * whether the tuple is actually deleted, callers can use it to + * decide whether to continue the operation. When this DELETE is a + * part of an UPDATE of partition-key, then the slot returned by + * EvalPlanQual() is passed back using output parameter epqslot. * * Returns RETURNING result if any, otherwise NULL. * ---------------------------------------------------------------- @@ -621,10 +625,11 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, - bool *tupleDeleted, bool processReturning, bool canSetTag, - bool changingPart) + bool changingPart, + bool *tupleDeleted, + TupleTableSlot **epqslot) { ResultRelInfo *resultRelInfo; Relation resultRelationDesc; @@ -649,7 +654,7 @@ ExecDelete(ModifyTableState *mtstate, bool dodelete; dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tupleid, oldtuple); + tupleid, oldtuple, epqslot); if (!dodelete) /* "do nothing" */ return NULL; @@ -769,19 +774,30 @@ ldelete:; if (!ItemPointerEquals(tupleid, &hufd.ctid)) { - TupleTableSlot *epqslot; + TupleTableSlot *my_epqslot; - epqslot = EvalPlanQual(estate, - epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - LockTupleExclusive, - &hufd.ctid, - hufd.xmax); - if (!TupIsNull(epqslot)) + my_epqslot = EvalPlanQual(estate, + epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + LockTupleExclusive, + &hufd.ctid, + hufd.xmax); + if (!TupIsNull(my_epqslot)) { *tupleid = hufd.ctid; - goto ldelete; + + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqslot) + { + *epqslot = my_epqslot; + return NULL; + } + else + goto ldelete; } } /* tuple already deleted; nothing to do */ @@ -1052,6 +1068,7 @@ lreplace:; { bool tuple_deleted; TupleTableSlot *ret_slot; + TupleTableSlot *epqslot = NULL; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; int map_index; TupleConversionMap *tupconv_map; @@ -1081,8 +1098,8 @@ lreplace:; * processing. We want to return rows from INSERT. */ ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, - estate, &tuple_deleted, false, - false /* canSetTag */ , true /* changingPart */ ); + estate, false, false /* canSetTag */ , + true /* changingPart */ , &tuple_deleted, &epqslot); /* * For some reason if DELETE didn't happen (e.g. trigger prevented @@ -1105,7 +1122,23 @@ lreplace:; * resurrect it. */ if (!tuple_deleted) - return NULL; + { + /* + * epqslot will be typically NULL. But when ExecDelete() + * finds that another transaction has concurrently updated the + * same row, it re-fetches the row, skips the delete, and + * epqslot is set to the re-fetched tuple slot. In that case, + * we need to do all the checks again. + */ + if (TupIsNull(epqslot)) + return NULL; + else + { + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot); + tuple = ExecMaterializeSlot(slot); + goto lreplace; + } + } /* * Updates set the transition capture map only when a new subplan @@ -2136,8 +2169,8 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, &node->mt_epqstate, estate, - NULL, true, node->canSetTag, - false /* changingPart */ ); + true, node->canSetTag, + false /* changingPart */ , NULL, NULL); break; default: elog(ERROR, "unknown operation"); diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index a5b8610fa2..1031448c14 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -206,7 +206,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, ItemPointer tupleid, - HeapTuple fdw_trigtuple); + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, diff --git a/src/test/isolation/expected/partition-key-update-4.out b/src/test/isolation/expected/partition-key-update-4.out new file mode 100644 index 0000000000..774a7faf6c --- /dev/null +++ b/src/test/isolation/expected/partition-key-update-4.out @@ -0,0 +1,60 @@ +Parsed test spec with 2 sessions + +starting permutation: s1b s2b s2u1 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u1: UPDATE foo SET b = b || ' update2' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo2 2 ABC update2 update1 + +starting permutation: s1b s2b s2ut1 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut1: UPDATE footrg SET b = b || ' update2' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg2 2 ABC update2 update1 +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + +1 ABC update2 trigger + +starting permutation: s1b s2b s2u2 s1u s2c s1c s1s +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2u2: UPDATE foo SET b = 'EFG' WHERE a = 1; +step s1u: UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1u: <... completed> +step s1c: COMMIT; +step s1s: SELECT tableoid::regclass, * FROM foo ORDER BY a; +tableoid a b + +foo1 1 EFG + +starting permutation: s1b s2b s2ut2 s1ut s2c s1c s1st s1stl +step s1b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2b: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ut2: UPDATE footrg SET b = 'EFG' WHERE a = 1; +step s1ut: UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; +step s2c: COMMIT; +step s1ut: <... completed> +step s1c: COMMIT; +step s1st: SELECT tableoid::regclass, * FROM footrg ORDER BY a; +tableoid a b + +footrg1 1 EFG +step s1stl: SELECT * FROM triglog ORDER BY a; +a b + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 0e997215a8..d5594e80e2 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -74,4 +74,5 @@ test: predicate-gin-nomatch test: partition-key-update-1 test: partition-key-update-2 test: partition-key-update-3 +test: partition-key-update-4 test: plpgsql-toast diff --git a/src/test/isolation/specs/partition-key-update-4.spec b/src/test/isolation/specs/partition-key-update-4.spec new file mode 100644 index 0000000000..1d53a7d0c6 --- /dev/null +++ b/src/test/isolation/specs/partition-key-update-4.spec @@ -0,0 +1,76 @@ +# Test that a row that ends up in a new partition contains changes made by +# a concurrent transaction. + +setup +{ + -- + -- Setup to test concurrent handling of ExecDelete(). + -- + CREATE TABLE foo (a int, b text) PARTITION BY LIST(a); + CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo2 PARTITION OF foo FOR VALUES IN (2); + INSERT INTO foo VALUES (1, 'ABC'); + + -- + -- Setup to test concurrent handling of GetTupleForTrigger(). + -- + CREATE TABLE footrg (a int, b text) PARTITION BY LIST(a); + CREATE TABLE triglog as select * from footrg; + CREATE TABLE footrg1 PARTITION OF footrg FOR VALUES IN (1); + CREATE TABLE footrg2 PARTITION OF footrg FOR VALUES IN (2); + INSERT INTO footrg VALUES (1, 'ABC'); + CREATE FUNCTION func_footrg() RETURNS TRIGGER AS $$ + BEGIN + OLD.b = OLD.b || ' trigger'; + + -- This will verify that the trigger is not run *before* the row is + -- refetched by EvalPlanQual. The OLD row should contain the changes made + -- by the concurrent session. + INSERT INTO triglog select OLD.*; + + RETURN OLD; + END $$ LANGUAGE PLPGSQL; + CREATE TRIGGER footrg_ondel BEFORE DELETE ON footrg1 + FOR EACH ROW EXECUTE PROCEDURE func_footrg(); + +} + +teardown +{ + DROP TABLE foo; + DROP TRIGGER footrg_ondel ON footrg1; + DROP FUNCTION func_footrg(); + DROP TABLE footrg; + DROP TABLE triglog; +} + +session "s1" +step "s1b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s1u" { UPDATE foo SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1ut" { UPDATE footrg SET a = a + 1, b = b || ' update1' WHERE b like '%ABC%'; } +step "s1s" { SELECT tableoid::regclass, * FROM foo ORDER BY a; } +step "s1st" { SELECT tableoid::regclass, * FROM footrg ORDER BY a; } +step "s1stl" { SELECT * FROM triglog ORDER BY a; } +step "s1c" { COMMIT; } + +session "s2" +step "s2b" { BEGIN ISOLATION LEVEL READ COMMITTED; } +step "s2u1" { UPDATE foo SET b = b || ' update2' WHERE a = 1; } +step "s2u2" { UPDATE foo SET b = 'EFG' WHERE a = 1; } +step "s2ut1" { UPDATE footrg SET b = b || ' update2' WHERE a = 1; } +step "s2ut2" { UPDATE footrg SET b = 'EFG' WHERE a = 1; } +step "s2c" { COMMIT; } + + +# Session s1 is moving a row into another partition, but is waiting for +# another session s2 that is updating the original row. The row that ends up +# in the new partition should contain the changes made by session s2. +permutation "s1b" "s2b" "s2u1" "s1u" "s2c" "s1c" "s1s" + +# Same as above, except, session s1 is waiting in GetTupleTrigger(). +permutation "s1b" "s2b" "s2ut1" "s1ut" "s2c" "s1c" "s1st" "s1stl" + +# Below two cases are similar to the above two; except that the session s1 +# fails EvalPlanQual() test, so partition key update does not happen. +permutation "s1b" "s2b" "s2u2" "s1u" "s2c" "s1c" "s1s" +permutation "s1b" "s2b" "s2ut2" "s1ut" "s2c" "s1c" "s1st" "s1stl"