diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7abee7a2d8..4e1c96271a 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.371 2009/10/28 14:55:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.372 2009/11/15 02:45:34 tgl Exp $ * * NOTES * Every node type that can appear in stored rules' parsetrees *must* @@ -1501,6 +1501,7 @@ _outMergePath(StringInfo str, MergePath *node) WRITE_NODE_FIELD(path_mergeclauses); WRITE_NODE_FIELD(outersortkeys); WRITE_NODE_FIELD(innersortkeys); + WRITE_BOOL_FIELD(materialize_inner); } static void diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4d402ca720..c225d7e288 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.188 2009/10/26 02:26:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.189 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1443,13 +1443,12 @@ print_path(PlannerInfo *root, Path *path, int indent) { MergePath *mp = (MergePath *) path; - if (mp->outersortkeys || mp->innersortkeys) - { - for (i = 0; i < indent; i++) - printf("\t"); - printf(" sortouter=%d sortinner=%d\n", - ((mp->outersortkeys) ? 1 : 0), - ((mp->innersortkeys) ? 1 : 0)); + for (i = 0; i < indent; i++) + printf("\t"); + printf(" sortouter=%d sortinner=%d materializeinner=%d\n", + ((mp->outersortkeys) ? 1 : 0), + ((mp->innersortkeys) ? 1 : 0), + ((mp->materialize_inner) ? 1 : 0)); } } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 6acc5ae34b..ffbd9afbba 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.212 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1166,23 +1166,6 @@ cost_sort(Path *path, PlannerInfo *root, path->total_cost = startup_cost + run_cost; } -/* - * sort_exceeds_work_mem - * Given a finished Sort plan node, detect whether it is expected to - * spill to disk (ie, will need more than work_mem workspace) - * - * This assumes there will be no available LIMIT. - */ -bool -sort_exceeds_work_mem(Sort *sort) -{ - double input_bytes = relation_byte_size(sort->plan.plan_rows, - sort->plan.plan_width); - long work_mem_bytes = work_mem * 1024L; - - return (input_bytes > work_mem_bytes); -} - /* * cost_material * Determines and returns the cost of materializing a relation, including @@ -1543,7 +1526,18 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) * Determines and returns the cost of joining two relations using the * merge join algorithm. * - * 'path' is already filled in except for the cost fields + * Unlike other costsize functions, this routine makes one actual decision: + * whether we should materialize the inner path. We do that either because + * the inner path can't support mark/restore, or because it's cheaper to + * use an interposed Material node to handle mark/restore. When the decision + * is cost-based it would be logically cleaner to build and cost two separate + * paths with and without that flag set; but that would require repeating most + * of the calculations here, which are not all that cheap. Since the choice + * will not affect output pathkeys or startup cost, only total cost, there is + * no possibility of wanting to keep both paths. So it seems best to make + * the decision here and record it in the path's materialize_inner field. + * + * 'path' is already filled in except for the cost fields and materialize_inner * 'sjinfo' is extra info about the join for selectivity estimation * * Notes: path's mergeclauses should be a subset of the joinrestrictinfo list; @@ -1561,7 +1555,10 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) List *innersortkeys = path->innersortkeys; Cost startup_cost = 0; Cost run_cost = 0; - Cost cpu_per_tuple; + Cost cpu_per_tuple, + inner_run_cost, + bare_inner_cost, + mat_inner_cost; QualCost merge_qual_cost; QualCost qp_qual_cost; double outer_path_rows = PATH_ROWS(outer_path); @@ -1606,10 +1603,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * When there are equal merge keys in the outer relation, the mergejoin * must rescan any matching tuples in the inner relation. This means - * re-fetching inner tuples. Our cost model for this is that a re-fetch - * costs the same as an original fetch, which is probably an overestimate; - * but on the other hand we ignore the bookkeeping costs of mark/restore. - * Not clear if it's worth developing a more refined model. + * re-fetching inner tuples; we have to estimate how often that happens. * * For regular inner and outer joins, the number of re-fetches can be * estimated approximately as size of merge join output minus size of @@ -1641,7 +1635,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) if (rescannedtuples < 0) rescannedtuples = 0; } - /* We'll inflate inner run cost this much to account for rescanning */ + /* We'll inflate various costs this much to account for rescanning */ rescanratio = 1.0 + (rescannedtuples / inner_path_rows); /* @@ -1778,32 +1772,83 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) -1.0); startup_cost += sort_path.startup_cost; startup_cost += (sort_path.total_cost - sort_path.startup_cost) - * innerstartsel * rescanratio; - run_cost += (sort_path.total_cost - sort_path.startup_cost) - * (innerendsel - innerstartsel) * rescanratio; - - /* - * If the inner sort is expected to spill to disk, we want to add a - * materialize node to shield it from the need to handle mark/restore. - * This will allow it to perform the last merge pass on-the-fly, while - * in most cases not requiring the materialize to spill to disk. - * Charge an extra cpu_tuple_cost per tuple to account for the - * materialize node. (Keep this estimate in sync with similar ones in - * create_mergejoin_path and create_mergejoin_plan.) - */ - if (relation_byte_size(inner_path_rows, inner_path->parent->width) > - (work_mem * 1024L)) - run_cost += cpu_tuple_cost * inner_path_rows; + * innerstartsel; + inner_run_cost = (sort_path.total_cost - sort_path.startup_cost) + * (innerendsel - innerstartsel); } else { startup_cost += inner_path->startup_cost; startup_cost += (inner_path->total_cost - inner_path->startup_cost) - * innerstartsel * rescanratio; - run_cost += (inner_path->total_cost - inner_path->startup_cost) - * (innerendsel - innerstartsel) * rescanratio; + * innerstartsel; + inner_run_cost = (inner_path->total_cost - inner_path->startup_cost) + * (innerendsel - innerstartsel); } + /* + * Decide whether we want to materialize the inner input to shield it from + * mark/restore and performing re-fetches. Our cost model for regular + * re-fetches is that a re-fetch costs the same as an original fetch, + * which is probably an overestimate; but on the other hand we ignore the + * bookkeeping costs of mark/restore. Not clear if it's worth developing + * a more refined model. So we just need to inflate the inner run cost + * by rescanratio. + */ + bare_inner_cost = inner_run_cost * rescanratio; + /* + * When we interpose a Material node the re-fetch cost is assumed to be + * just cpu_tuple_cost per tuple, independently of the underlying plan's + * cost; but we have to charge an extra cpu_tuple_cost per original fetch + * as well. Note that we're assuming the materialize node will never + * spill to disk, since it only has to remember tuples back to the last + * mark. (If there are a huge number of duplicates, our other cost + * factors will make the path so expensive that it probably won't get + * chosen anyway.) So we don't use cost_rescan here. + * + * Note: keep this estimate in sync with create_mergejoin_plan's labeling + * of the generated Material node. + */ + mat_inner_cost = inner_run_cost + + cpu_tuple_cost * inner_path_rows * rescanratio; + + /* Prefer materializing if it looks cheaper */ + if (mat_inner_cost < bare_inner_cost) + path->materialize_inner = true; + /* + * Even if materializing doesn't look cheaper, we *must* do it if the + * inner path is to be used directly (without sorting) and it doesn't + * support mark/restore. + * + * Since the inner side must be ordered, and only Sorts and IndexScans can + * create order to begin with, and they both support mark/restore, you + * might think there's no problem --- but you'd be wrong. Nestloop and + * merge joins can *preserve* the order of their inputs, so they can be + * selected as the input of a mergejoin, and they don't support + * mark/restore at present. + */ + else if (innersortkeys == NIL && + !ExecSupportsMarkRestore(inner_path->pathtype)) + path->materialize_inner = true; + /* + * Also, force materializing if the inner path is to be sorted and the + * sort is expected to spill to disk. This is because the final merge + * pass can be done on-the-fly if it doesn't have to support mark/restore. + * We don't try to adjust the cost estimates for this consideration, + * though. + */ + else if (innersortkeys != NIL && + relation_byte_size(inner_path_rows, inner_path->parent->width) > + (work_mem * 1024L)) + path->materialize_inner = true; + else + path->materialize_inner = false; + + /* Charge the right incremental cost for the chosen case */ + if (path->materialize_inner) + run_cost += mat_inner_cost; + else + run_cost += bare_inner_cost; + /* CPU costs */ /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index b068d2f3f8..41238b7c0e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.266 2009/10/26 02:26:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.267 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1664,9 +1664,8 @@ create_mergejoin_plan(PlannerInfo *root, best_path->jpath.outerjoinpath->parent->relids); /* - * Create explicit sort nodes for the outer and inner join paths if - * necessary. The sort cost was already accounted for in the path. Make - * sure there are no excess columns in the inputs if sorting. + * Create explicit sort nodes for the outer and inner paths if necessary. + * Make sure there are no excess columns in the inputs if sorting. */ if (best_path->outersortkeys) { @@ -1695,23 +1694,17 @@ create_mergejoin_plan(PlannerInfo *root, innerpathkeys = best_path->jpath.innerjoinpath->pathkeys; /* - * If inner plan is a sort that is expected to spill to disk, add a - * materialize node to shield it from the need to handle mark/restore. - * This will allow it to perform the last merge pass on-the-fly, while in - * most cases not requiring the materialize to spill to disk. - * - * XXX really, Sort oughta do this for itself, probably, to avoid the - * overhead of a separate plan node. + * If specified, add a materialize node to shield the inner plan from + * the need to handle mark/restore. */ - if (IsA(inner_plan, Sort) && - sort_exceeds_work_mem((Sort *) inner_plan)) + if (best_path->materialize_inner) { Plan *matplan = (Plan *) make_material(inner_plan); /* * We assume the materialize will not spill to disk, and therefore * charge just cpu_tuple_cost per tuple. (Keep this estimate in sync - * with similar ones in cost_mergejoin and create_mergejoin_path.) + * with cost_mergejoin.) */ copy_plan_costsize(matplan, inner_plan); matplan->total_cost += cpu_tuple_cost * matplan->plan_rows; @@ -1887,6 +1880,7 @@ create_mergejoin_plan(PlannerInfo *root, inner_plan, best_path->jpath.jointype); + /* Costs of sort and material steps are included in path cost already */ copy_path_costsize(&join_plan->join.plan, &best_path->jpath.path); return join_plan; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e9733892cc..62169e589b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.154 2009/09/17 20:49:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.155 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -17,7 +17,6 @@ #include #include "catalog/pg_operator.h" -#include "executor/executor.h" #include "miscadmin.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -1414,47 +1413,6 @@ create_mergejoin_path(PlannerInfo *root, pathkeys_contained_in(innersortkeys, inner_path->pathkeys)) innersortkeys = NIL; - /* - * If we are not sorting the inner path, we may need a materialize node to - * ensure it can be marked/restored. - * - * Since the inner side must be ordered, and only Sorts and IndexScans can - * create order to begin with, and they both support mark/restore, you - * might think there's no problem --- but you'd be wrong. Nestloop and - * merge joins can *preserve* the order of their inputs, so they can be - * selected as the input of a mergejoin, and they don't support - * mark/restore at present. - * - * Note: Sort supports mark/restore, so no materialize is really needed in - * that case; but one may be desirable anyway to optimize the sort. - * However, since we aren't representing the sort step separately in the - * Path tree, we can't explicitly represent the materialize either. So - * that case is not handled here. Instead, cost_mergejoin has to factor - * in the cost and create_mergejoin_plan has to add the plan node. - */ - if (innersortkeys == NIL && - !ExecSupportsMarkRestore(inner_path->pathtype)) - { - Path *mpath; - - mpath = (Path *) create_material_path(inner_path->parent, inner_path); - - /* - * We expect the materialize won't spill to disk (it could only do so - * if there were a whole lot of duplicate tuples, which is a case - * cost_mergejoin will avoid choosing anyway). Therefore - * cost_material's cost estimate is bogus and we should charge just - * cpu_tuple_cost per tuple. (Keep this estimate in sync with similar - * ones in cost_mergejoin and create_mergejoin_plan; also see - * cost_rescan.) - */ - mpath->startup_cost = inner_path->startup_cost; - mpath->total_cost = inner_path->total_cost; - mpath->total_cost += cpu_tuple_cost * inner_path->parent->rows; - - inner_path = mpath; - } - pathnode->jpath.path.pathtype = T_MergeJoin; pathnode->jpath.path.parent = joinrel; pathnode->jpath.jointype = jointype; @@ -1465,6 +1423,7 @@ create_mergejoin_path(PlannerInfo *root, pathnode->path_mergeclauses = mergeclauses; pathnode->outersortkeys = outersortkeys; pathnode->innersortkeys = innersortkeys; + /* pathnode->materialize_inner will be set by cost_mergejoin */ cost_mergejoin(pathnode, root, sjinfo); diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 59f83e8562..d54770eabc 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.178 2009/10/26 02:26:43 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.179 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -835,6 +835,14 @@ typedef JoinPath NestPath; /* * A mergejoin path has these fields. * + * Unlike other path types, a MergePath node doesn't represent just a single + * run-time plan node: it can represent up to four. Aside from the MergeJoin + * node itself, there can be a Sort node for the outer input, a Sort node + * for the inner input, and/or a Material node for the inner input. We could + * represent these nodes by separate path nodes, but considering how many + * different merge paths are investigated during a complex join problem, + * it seems better to avoid unnecessary palloc overhead. + * * path_mergeclauses lists the clauses (in the form of RestrictInfos) * that will be used in the merge. * @@ -846,15 +854,19 @@ typedef JoinPath NestPath; * outersortkeys (resp. innersortkeys) is NIL if the outer path * (resp. inner path) is already ordered appropriately for the * mergejoin. If it is not NIL then it is a PathKeys list describing - * the ordering that must be created by an explicit sort step. + * the ordering that must be created by an explicit Sort node. + * + * materialize_inner is TRUE if a Material node should be placed atop the + * inner input. This may appear with or without an inner Sort step. */ typedef struct MergePath { JoinPath jpath; List *path_mergeclauses; /* join clauses to be used for merge */ - List *outersortkeys; /* keys for explicit sort, if any */ - List *innersortkeys; /* keys for explicit sort, if any */ + List *outersortkeys; /* keys for explicit sort, if any */ + List *innersortkeys; /* keys for explicit sort, if any */ + bool materialize_inner; /* add Materialize to inner? */ } MergePath; /* diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index f862979c08..041b7e8f9d 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.98 2009/09/12 22:12:04 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.99 2009/11/15 02:45:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -84,7 +84,6 @@ extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm); extern void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, Cost input_cost, double tuples, int width, double limit_tuples); -extern bool sort_exceeds_work_mem(Sort *sort); extern void cost_material(Path *path, Cost input_startup_cost, Cost input_total_cost, double tuples, int width);