From 529ebb20aaa5eb68e4fb7a656271bbb83efe9529 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 5 Nov 2019 11:42:24 -0500 Subject: [PATCH] Generate EquivalenceClass members for partitionwise child join rels. Commit d25ea0127 got rid of what I thought were entirely unnecessary derived child expressions in EquivalenceClasses for EC members that mention multiple baserels. But it turns out that some of the child expressions that code created are necessary for partitionwise joins, else we fail to find matching pathkeys for Sort nodes. (This happens only for certain shapes of the resulting plan; it may be that partitionwise aggregation is also necessary to show the failure, though I'm not sure of that.) Reverting that commit entirely would be quite painful performance-wise for large partition sets. So instead, add code that explicitly generates child expressions that match only partitionwise child join rels we have actually generated. Per report from Justin Pryzby. (Amit Langote noticed the problem earlier, though it's not clear if he recognized then that it could result in a planner error, not merely failure to exploit partitionwise join, in the code as-committed.) Back-patch to v12 where commit d25ea0127 came in. Amit Langote, with lots of kibitzing from me Discussion: https://postgr.es/m/CA+HiwqG2WVUGmLJqtR0tPFhniO=H=9qQ+Z3L_ZC+Y3-EVQHFGg@mail.gmail.com Discussion: https://postgr.es/m/20191011143703.GN10470@telsasoft.com --- src/backend/optimizer/path/equivclass.c | 161 +++++++++++++++++-- src/backend/optimizer/util/relnode.c | 15 +- src/include/optimizer/paths.h | 5 + src/test/regress/expected/partition_join.out | 77 +++++++++ src/test/regress/sql/partition_join.sql | 15 ++ 5 files changed, 259 insertions(+), 14 deletions(-) diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index ccc07ba9f0..082776f7fe 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -1132,7 +1132,7 @@ generate_join_implied_equalities(PlannerInfo *root, Relids inner_relids = inner_rel->relids; Relids nominal_inner_relids; Relids nominal_join_relids; - Bitmapset * matching_ecs; + Bitmapset *matching_ecs; int i; /* If inner rel is a child, extra setup work is needed */ @@ -2209,7 +2209,7 @@ match_eclasses_to_foreign_key_col(PlannerInfo *root, /* * add_child_rel_equivalences - * Search for EC members that reference the parent_rel, and + * Search for EC members that reference the root parent of child_rel, and * add transformed members referencing the child_rel. * * Note that this function won't be called at all unless we have at least some @@ -2217,6 +2217,12 @@ match_eclasses_to_foreign_key_col(PlannerInfo *root, * * parent_rel and child_rel could be derived from appinfo, but since the * caller has already computed them, we might as well just pass them in. + * + * The passed-in AppendRelInfo is not used when the parent_rel is not a + * top-level baserel, since it shows the mapping from the parent_rel but + * we need to translate EC expressions that refer to the top-level parent. + * Using it is faster than using adjust_appendrel_attrs_multilevel(), though, + * so we prefer it when we can. */ void add_child_rel_equivalences(PlannerInfo *root, @@ -2224,6 +2230,8 @@ add_child_rel_equivalences(PlannerInfo *root, RelOptInfo *parent_rel, RelOptInfo *child_rel) { + Relids top_parent_relids = child_rel->top_parent_relids; + Relids child_relids = child_rel->relids; int i; /* @@ -2248,7 +2256,7 @@ add_child_rel_equivalences(PlannerInfo *root, continue; /* Sanity check eclass_indexes only contain ECs for parent_rel */ - Assert(bms_is_subset(child_rel->top_parent_relids, cur_ec->ec_relids)); + Assert(bms_is_subset(top_parent_relids, cur_ec->ec_relids)); /* * We don't use foreach() here because there's no point in scanning @@ -2268,13 +2276,14 @@ add_child_rel_equivalences(PlannerInfo *root, * already-transformed child members. Otherwise, if some original * member expression references more than one appendrel, we'd get * an O(N^2) explosion of useless derived expressions for - * combinations of children. + * combinations of children. (But add_child_join_rel_equivalences + * may add targeted combinations for partitionwise-join purposes.) */ if (cur_em->em_is_child) continue; /* ignore children here */ /* Does this member reference child's topmost parent rel? */ - if (bms_overlap(cur_em->em_relids, child_rel->top_parent_relids)) + if (bms_overlap(cur_em->em_relids, top_parent_relids)) { /* Yes, generate transformed child version */ Expr *child_expr; @@ -2295,8 +2304,8 @@ add_child_rel_equivalences(PlannerInfo *root, child_expr = (Expr *) adjust_appendrel_attrs_multilevel(root, (Node *) cur_em->em_expr, - child_rel->relids, - child_rel->top_parent_relids); + child_relids, + top_parent_relids); } /* @@ -2306,21 +2315,20 @@ add_child_rel_equivalences(PlannerInfo *root, * don't want the child member to be marked as constant. */ new_relids = bms_difference(cur_em->em_relids, - child_rel->top_parent_relids); - new_relids = bms_add_members(new_relids, child_rel->relids); + top_parent_relids); + new_relids = bms_add_members(new_relids, child_relids); /* * And likewise for nullable_relids. Note this code assumes * parent and child relids are singletons. */ new_nullable_relids = cur_em->em_nullable_relids; - if (bms_overlap(new_nullable_relids, - child_rel->top_parent_relids)) + if (bms_overlap(new_nullable_relids, top_parent_relids)) { new_nullable_relids = bms_difference(new_nullable_relids, - child_rel->top_parent_relids); + top_parent_relids); new_nullable_relids = bms_add_members(new_nullable_relids, - child_rel->relids); + child_relids); } (void) add_eq_member(cur_ec, child_expr, @@ -2334,6 +2342,133 @@ add_child_rel_equivalences(PlannerInfo *root, } } +/* + * add_child_join_rel_equivalences + * Like add_child_rel_equivalences(), but for joinrels + * + * Here we find the ECs relevant to the top parent joinrel and add transformed + * member expressions that refer to this child joinrel. + * + * Note that this function won't be called at all unless we have at least some + * reason to believe that the EC members it generates will be useful. + */ +void +add_child_join_rel_equivalences(PlannerInfo *root, + int nappinfos, AppendRelInfo **appinfos, + RelOptInfo *parent_joinrel, + RelOptInfo *child_joinrel) +{ + Relids top_parent_relids = child_joinrel->top_parent_relids; + Relids child_relids = child_joinrel->relids; + Bitmapset *matching_ecs; + int i; + + Assert(IS_JOIN_REL(child_joinrel) && IS_JOIN_REL(parent_joinrel)); + + /* We need consider only ECs that mention the parent joinrel */ + matching_ecs = get_eclass_indexes_for_relids(root, top_parent_relids); + + i = -1; + while ((i = bms_next_member(matching_ecs, i)) >= 0) + { + EquivalenceClass *cur_ec = (EquivalenceClass *) list_nth(root->eq_classes, i); + int num_members; + + /* + * If this EC contains a volatile expression, then generating child + * EMs would be downright dangerous, so skip it. We rely on a + * volatile EC having only one EM. + */ + if (cur_ec->ec_has_volatile) + continue; + + /* Sanity check on get_eclass_indexes_for_relids result */ + Assert(bms_overlap(top_parent_relids, cur_ec->ec_relids)); + + /* + * We don't use foreach() here because there's no point in scanning + * newly-added child members, so we can stop after the last + * pre-existing EC member. + */ + num_members = list_length(cur_ec->ec_members); + for (int pos = 0; pos < num_members; pos++) + { + EquivalenceMember *cur_em = (EquivalenceMember *) list_nth(cur_ec->ec_members, pos); + + if (cur_em->em_is_const) + continue; /* ignore consts here */ + + /* + * We consider only original EC members here, not + * already-transformed child members. + */ + if (cur_em->em_is_child) + continue; /* ignore children here */ + + /* + * We may ignore expressions that reference a single baserel, + * because add_child_rel_equivalences should have handled them. + */ + if (bms_membership(cur_em->em_relids) != BMS_MULTIPLE) + continue; + + /* Does this member reference child's topmost parent rel? */ + if (bms_overlap(cur_em->em_relids, top_parent_relids)) + { + /* Yes, generate transformed child version */ + Expr *child_expr; + Relids new_relids; + Relids new_nullable_relids; + + if (parent_joinrel->reloptkind == RELOPT_JOINREL) + { + /* Simple single-level transformation */ + child_expr = (Expr *) + adjust_appendrel_attrs(root, + (Node *) cur_em->em_expr, + nappinfos, appinfos); + } + else + { + /* Must do multi-level transformation */ + Assert(parent_joinrel->reloptkind == RELOPT_OTHER_JOINREL); + child_expr = (Expr *) + adjust_appendrel_attrs_multilevel(root, + (Node *) cur_em->em_expr, + child_relids, + top_parent_relids); + } + + /* + * Transform em_relids to match. Note we do *not* do + * pull_varnos(child_expr) here, as for example the + * transformation might have substituted a constant, but we + * don't want the child member to be marked as constant. + */ + new_relids = bms_difference(cur_em->em_relids, + top_parent_relids); + new_relids = bms_add_members(new_relids, child_relids); + + /* + * For nullable_relids, we must selectively replace parent + * nullable relids with child ones. + */ + new_nullable_relids = cur_em->em_nullable_relids; + if (bms_overlap(new_nullable_relids, top_parent_relids)) + new_nullable_relids = + adjust_child_relids_multilevel(root, + new_nullable_relids, + child_relids, + top_parent_relids); + + (void) add_eq_member(cur_ec, child_expr, + new_relids, new_nullable_relids, + true, cur_em->em_datatype); + } + } + } +} + /* * generate_implied_equalities_for_column diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 85415381fb..03e02423b2 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -843,6 +843,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Compute information relevant to foreign relations. */ set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + /* Compute information needed for mapping Vars to the child rel */ appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); /* Set up reltarget struct */ @@ -854,7 +855,6 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, (Node *) parent_joinrel->joininfo, nappinfos, appinfos); - pfree(appinfos); /* * Lateral relids referred in child join will be same as that referred in @@ -886,6 +886,19 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Add the relation to the PlannerInfo. */ add_join_rel(root, joinrel); + /* + * We might need EquivalenceClass members corresponding to the child join, + * so that we can represent sort pathkeys for it. As with children of + * baserels, we shouldn't need this unless there are relevant eclass joins + * (implying that a merge join might be possible) or pathkeys to sort by. + */ + if (joinrel->has_eclass_joins || has_useful_pathkeys(root, parent_joinrel)) + add_child_join_rel_equivalences(root, + nappinfos, appinfos, + parent_joinrel, joinrel); + + pfree(appinfos); + return joinrel; } diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 7345137d1d..c6c34630c2 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -153,6 +153,11 @@ extern void add_child_rel_equivalences(PlannerInfo *root, AppendRelInfo *appinfo, RelOptInfo *parent_rel, RelOptInfo *child_rel); +extern void add_child_join_rel_equivalences(PlannerInfo *root, + int nappinfos, + AppendRelInfo **appinfos, + RelOptInfo *parent_rel, + RelOptInfo *child_rel); extern List *generate_implied_equalities_for_column(PlannerInfo *root, RelOptInfo *rel, ec_matches_callback_type callback, diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index cad8dd591a..975bf6765c 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -459,6 +459,83 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL 550 | | (12 rows) +-- bug with inadequate sort key representation +SET enable_partitionwise_aggregate TO true; +SET enable_hashjoin TO false; +EXPLAIN (COSTS OFF) +SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b) + WHERE a BETWEEN 490 AND 510 + GROUP BY 1, 2 ORDER BY 1, 2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Group + Group Key: (COALESCE(prt1_p1.a, p2.a)), (COALESCE(prt1_p1.b, p2.b)) + -> Merge Append + Sort Key: (COALESCE(prt1_p1.a, p2.a)), (COALESCE(prt1_p1.b, p2.b)) + -> Group + Group Key: (COALESCE(prt1_p1.a, p2.a)), (COALESCE(prt1_p1.b, p2.b)) + -> Sort + Sort Key: (COALESCE(prt1_p1.a, p2.a)), (COALESCE(prt1_p1.b, p2.b)) + -> Merge Full Join + Merge Cond: ((prt1_p1.a = p2.a) AND (prt1_p1.b = p2.b)) + Filter: ((COALESCE(prt1_p1.a, p2.a) >= 490) AND (COALESCE(prt1_p1.a, p2.a) <= 510)) + -> Sort + Sort Key: prt1_p1.a, prt1_p1.b + -> Seq Scan on prt1_p1 + -> Sort + Sort Key: p2.a, p2.b + -> Seq Scan on prt2_p1 p2 + -> Group + Group Key: (COALESCE(prt1_p2.a, p2_1.a)), (COALESCE(prt1_p2.b, p2_1.b)) + -> Sort + Sort Key: (COALESCE(prt1_p2.a, p2_1.a)), (COALESCE(prt1_p2.b, p2_1.b)) + -> Merge Full Join + Merge Cond: ((prt1_p2.a = p2_1.a) AND (prt1_p2.b = p2_1.b)) + Filter: ((COALESCE(prt1_p2.a, p2_1.a) >= 490) AND (COALESCE(prt1_p2.a, p2_1.a) <= 510)) + -> Sort + Sort Key: prt1_p2.a, prt1_p2.b + -> Seq Scan on prt1_p2 + -> Sort + Sort Key: p2_1.a, p2_1.b + -> Seq Scan on prt2_p2 p2_1 + -> Group + Group Key: (COALESCE(prt1_p3.a, p2_2.a)), (COALESCE(prt1_p3.b, p2_2.b)) + -> Sort + Sort Key: (COALESCE(prt1_p3.a, p2_2.a)), (COALESCE(prt1_p3.b, p2_2.b)) + -> Merge Full Join + Merge Cond: ((prt1_p3.a = p2_2.a) AND (prt1_p3.b = p2_2.b)) + Filter: ((COALESCE(prt1_p3.a, p2_2.a) >= 490) AND (COALESCE(prt1_p3.a, p2_2.a) <= 510)) + -> Sort + Sort Key: prt1_p3.a, prt1_p3.b + -> Seq Scan on prt1_p3 + -> Sort + Sort Key: p2_2.a, p2_2.b + -> Seq Scan on prt2_p3 p2_2 +(43 rows) + +SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b) + WHERE a BETWEEN 490 AND 510 + GROUP BY 1, 2 ORDER BY 1, 2; + a | b +-----+---- + 490 | 15 + 492 | 17 + 494 | 19 + 495 | 20 + 496 | 21 + 498 | 23 + 500 | 0 + 501 | 1 + 502 | 2 + 504 | 4 + 506 | 6 + 507 | 7 + 508 | 8 + 510 | 10 +(14 rows) + +RESET enable_partitionwise_aggregate; +RESET enable_hashjoin; -- -- partitioned by expression -- diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index fb3ba18a26..92994b479b 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -91,6 +91,21 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; +-- bug with inadequate sort key representation +SET enable_partitionwise_aggregate TO true; +SET enable_hashjoin TO false; + +EXPLAIN (COSTS OFF) +SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b) + WHERE a BETWEEN 490 AND 510 + GROUP BY 1, 2 ORDER BY 1, 2; +SELECT a, b FROM prt1 FULL JOIN prt2 p2(b,a,c) USING(a,b) + WHERE a BETWEEN 490 AND 510 + GROUP BY 1, 2 ORDER BY 1, 2; + +RESET enable_partitionwise_aggregate; +RESET enable_hashjoin; + -- -- partitioned by expression --