/*------------------------------------------------------------------------- * * joinpath.c * Routines to find all possible paths for processing a set of joins * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.46 1999/08/21 03:49:00 tgl Exp $ * *------------------------------------------------------------------------- */ #include #include #include "postgres.h" #include "access/htup.h" #include "catalog/pg_attribute.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/restrictinfo.h" #include "parser/parsetree.h" #include "utils/lsyscache.h" static Path *best_innerjoin(List *join_paths, List *outer_relid); static List *sort_inner_and_outer(RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, List *mergeclause_list); static List *match_unsorted_outer(RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, List *outerpath_list, Path *cheapest_inner, Path *best_innerjoin, List *mergeclause_list); static List *match_unsorted_inner(RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, List *innerpath_list, List *mergeclause_list); static List *hash_inner_and_outer(Query *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel); static Cost estimate_disbursion(Query *root, Var *var); static List *select_mergejoin_clauses(List *restrictinfo_list); /* * update_rels_pathlist_for_joins * Creates all possible ways to process joins for each of the join * relations in the list 'joinrels.' Each unique path will be included * in the join relation's 'pathlist' field. * * 'joinrels' is the list of relation entries to be joined * * Modifies the pathlist field of each joinrel node to contain * the unique join paths. 
*/ void update_rels_pathlist_for_joins(Query *root, List *joinrels) { List *j; foreach(j, joinrels) { RelOptInfo *joinrel = (RelOptInfo *) lfirst(j); Relids innerrelids; Relids outerrelids; RelOptInfo *innerrel; RelOptInfo *outerrel; Path *bestinnerjoin; List *pathlist; List *mergeclause_list = NIL; /* * On entry, joinrel->relids is a list of two sublists of relids, * namely the outer and inner member relids. Extract these sublists * and change joinrel->relids to a flattened single list. * (Use listCopy so as not to damage the member lists...) */ outerrelids = lfirst(joinrel->relids); innerrelids = lsecond(joinrel->relids); joinrel->relids = nconc(listCopy(outerrelids), listCopy(innerrelids)); /* * Get the corresponding RelOptInfos for the outer and inner sides. * Base relation id is an integer and join relation relid is a * list of integers. */ innerrel = (length(innerrelids) == 1) ? get_base_rel(root, lfirsti(innerrelids)) : get_join_rel(root, innerrelids); outerrel = (length(outerrelids) == 1) ? get_base_rel(root, lfirsti(outerrelids)) : get_join_rel(root, outerrelids); /* * Get the best inner join for match_unsorted_outer(). */ bestinnerjoin = best_innerjoin(innerrel->innerjoin, outerrel->relids); /* * Find potential mergejoin clauses. */ if (_enable_mergejoin_) mergeclause_list = select_mergejoin_clauses(joinrel->restrictinfo); /* * 1. Consider mergejoin paths where both relations must be * explicitly sorted. */ pathlist = sort_inner_and_outer(joinrel, outerrel, innerrel, mergeclause_list); /* * 2. Consider paths where the outer relation need not be * explicitly sorted. This includes both nestloops and * mergejoins where the outer path is already ordered. */ pathlist = add_pathlist(joinrel, pathlist, match_unsorted_outer(joinrel, outerrel, innerrel, outerrel->pathlist, innerrel->cheapestpath, bestinnerjoin, mergeclause_list)); /* * 3. Consider paths where the inner relation need not be * explicitly sorted. 
This includes mergejoins only * (nestloops were already built in match_unsorted_outer). */ pathlist = add_pathlist(joinrel, pathlist, match_unsorted_inner(joinrel, outerrel, innerrel, innerrel->pathlist, mergeclause_list)); /* * 4. Consider paths where both outer and inner relations must be * hashed before being joined. */ if (_enable_hashjoin_) pathlist = add_pathlist(joinrel, pathlist, hash_inner_and_outer(root, joinrel, outerrel, innerrel)); /* Save the completed pathlist in the join rel */ joinrel->pathlist = pathlist; } } /* * best_innerjoin * Find the cheapest index path that has already been identified by * indexable_joinclauses() as being a possible inner path for the given * outer relation(s) in a nestloop join. * * 'join_paths' is a list of potential inner indexscan join paths * 'outer_relids' is the relid list of the outer join relation * * Returns the pathnode of the best path, or NULL if there's no * usable path. */ static Path * best_innerjoin(List *join_paths, Relids outer_relids) { Path *cheapest = (Path *) NULL; List *join_path; foreach(join_path, join_paths) { Path *path = (Path *) lfirst(join_path); Assert(IsA(path, IndexPath)); /* path->joinrelids is the set of base rels that must be part of * outer_relids in order to use this inner path, because those * rels are used in the index join quals of this inner path. */ if (is_subset(((IndexPath *) path)->joinrelids, outer_relids) && (cheapest == NULL || path_is_cheaper(path, cheapest))) cheapest = path; } return cheapest; } /* * sort_inner_and_outer * Create mergejoin join paths by explicitly sorting both the outer and * inner join relations on each available merge ordering. * * 'joinrel' is the join relation * 'outerrel' is the outer join relation * 'innerrel' is the inner join relation * 'mergeclause_list' is a list of RestrictInfo nodes for available * mergejoin clauses between these two relations * * Returns a list of mergejoin paths. 
*/
static List *
sort_inner_and_outer(RelOptInfo *joinrel,
					 RelOptInfo *outerrel,
					 RelOptInfo *innerrel,
					 List *mergeclause_list)
{
	List	   *result = NIL;
	List	   *cell;

	/*
	 * Every ordering of the available mergejoin clauses yields a result
	 * path with a different sort order at essentially the same cost.  At
	 * this join level there is nothing to favor one ordering over another,
	 * although some orderings may serve higher-level mergejoins better.
	 * Building a path for *every* permutation would cost too much planning
	 * time to be worthwhile.
	 *
	 * So, for now, one path is built per mergejoin clause: that clause is
	 * placed first and the remaining clauses follow in no particular
	 * order.  This should at least permit a one-clause mergejoin against
	 * any possible mergejoin partner path without re-sorting.  If the
	 * secondary clauses end up in the wrong order, though, some clauses may
	 * be evaluated as qpquals when they could have been mergeclauses.  A
	 * better scheme is needed.  (Two ideas: consult the relevant index
	 * relations for plausible sort orders, or emit a single output path
	 * marked as having a freely rearrangeable sort order.)
	 */
	foreach(cell, mergeclause_list)
	{
		RestrictInfo *lead = lfirst(cell);
		List	   *ordered;
		List	   *outer_sort;
		List	   *inner_sort;
		List	   *result_keys;
		List	   *join_quals;
		MergePath  *mpath;

		/* Copy the clause list and move the current clause to the front. */
		ordered = listCopy(mergeclause_list);
		ordered = lremove(lead, ordered);
		ordered = lcons(lead, ordered);

		/*
		 * Derive the sort pathkeys required on each side.
		 *
		 * The cheapest path may already happen to be sorted suitably;
		 * create_mergejoin_path notices that and omits the explicit sort.
		 */
		outer_sort = make_pathkeys_for_mergeclauses(ordered,
													outerrel->targetlist);
		inner_sort = make_pathkeys_for_mergeclauses(ordered,
													innerrel->targetlist);

		/* Pathkeys describing the sort order of the join output. */
		result_keys = build_join_pathkeys(outer_sort,
										  joinrel->targetlist,
										  ordered);

		/* Build the path from the cheapest path on each side. */
		join_quals = get_actual_clauses(ordered);
		mpath = create_mergejoin_path(joinrel,
									  outerrel->size,
									  innerrel->size,
									  outerrel->width,
									  innerrel->width,
									  (Path *) outerrel->cheapestpath,
									  (Path *) innerrel->cheapestpath,
									  result_keys,
									  join_quals,
									  outer_sort,
									  inner_sort);

		result = lappend(result, mpath);
	}

	return result;
}

/*
 * match_unsorted_outer
 *	  Creates possible join paths for processing a single join relation
 *	  'joinrel' by employing either iterative substitution or
 *	  mergejoining on each of its possible outer paths (considering
 *	  only outer paths that are already ordered well enough for merging).
 *
 * We always generate a nestloop path for each available outer path.
 * If an indexscan inner path exists that is compatible with this outer rel
 * and cheaper than the cheapest general-purpose inner path, then we use
 * the indexscan inner path; else we use the cheapest general-purpose inner.
 *
 * We also consider mergejoins if mergejoin clauses are available.  We have
 * two ways to generate the inner path for a mergejoin: use the cheapest
 * inner path (sorting it if it's not suitably ordered already), or using an
 * inner path that is already suitably ordered for the merge.  If the
 * cheapest inner path is suitably ordered, then by definition it's the one
 * to use.  Otherwise, we look for ordered paths that are cheaper than the
 * cheapest inner + sort costs.  If we have several mergeclauses, it could be
 * that there is no inner path (or only a very expensive one) for the full
 * list of mergeclauses, but better paths exist if we truncate the
 * mergeclause list (thereby discarding some sort key requirements).
So, we
 * consider truncations of the mergeclause list as well as the full list.
 * In any case, we find the cheapest suitable path and generate a single
 * output mergejoin path.  (Since all the possible mergejoins will have
 * identical output pathkeys, there is no need to keep any but the cheapest.)
 *
 * 'joinrel' is the join relation
 * 'outerrel' is the outer join relation
 * 'innerrel' is the inner join relation
 * 'outerpath_list' is the list of possible outer paths
 * 'cheapest_inner' is the cheapest inner path
 * 'best_innerjoin' is the best inner index path (if any)
 * 'mergeclause_list' is a list of RestrictInfo nodes for available
 *				mergejoin clauses between these two relations
 *
 * Returns a list of possible join path nodes.
 */
static List *
match_unsorted_outer(RelOptInfo *joinrel,
					 RelOptInfo *outerrel,
					 RelOptInfo *innerrel,
					 List *outerpath_list,
					 Path *cheapest_inner,
					 Path *best_innerjoin,
					 List *mergeclause_list)
{
	List	   *path_list = NIL;
	Path	   *nestinnerpath;
	List	   *i;

	/*
	 * We only use the best innerjoin indexpath if it is cheaper
	 * than the cheapest general-purpose inner path.
	 *
	 * The choice does not depend on the outer path, so it is made once,
	 * before the loop.  best_innerjoin may be NULL, in which case
	 * cheapest_inner is used.
	 */
	if (best_innerjoin &&
		path_is_cheaper(best_innerjoin, cheapest_inner))
		nestinnerpath = best_innerjoin;
	else
		nestinnerpath = cheapest_inner;

	/*
	 * Each outer path contributes exactly one nestloop path, plus at most
	 * one mergejoin path (none when no mergeclause matches its ordering).
	 */
	foreach(i, outerpath_list)
	{
		Path	   *outerpath = (Path *) lfirst(i);
		List	   *mergeclauses;
		List	   *merge_pathkeys;
		List	   *innersortkeys;
		Path	   *mergeinnerpath;
		int			mergeclausecount;

		/* Look for useful mergeclauses (if any) */
		mergeclauses = find_mergeclauses_for_pathkeys(outerpath->pathkeys,
													  mergeclause_list);

		/*
		 * The result will have this sort order (even if it is implemented
		 * as a nestloop, and even if some of the mergeclauses are implemented
		 * by qpquals rather than as true mergeclauses):
		 */
		merge_pathkeys = build_join_pathkeys(outerpath->pathkeys,
											 joinrel->targetlist,
											 mergeclauses);

		/* Always consider a nestloop join with this outer and best inner. */
		path_list = lappend(path_list,
							create_nestloop_path(joinrel,
												 outerrel,
												 outerpath,
												 nestinnerpath,
												 merge_pathkeys));

		/* Done with this outer path if no chance for a mergejoin */
		if (mergeclauses == NIL)
			continue;

		/* Compute the required ordering of the inner path */
		innersortkeys = make_pathkeys_for_mergeclauses(mergeclauses,
													   innerrel->targetlist);

		/*
		 * Set up on the assumption that we will use the cheapest_inner
		 *
		 * mergeclausecount is the number of leading mergeclauses the final
		 * merge path will use.  It starts at the full count and is lowered
		 * below only if a presorted path matching a shorter prefix wins.
		 */
		mergeinnerpath = cheapest_inner;
		mergeclausecount = length(mergeclauses);

		/* If the cheapest_inner doesn't need to be sorted, it is the winner
		 * by definition.
		 */
		if (pathkeys_contained_in(innersortkeys,
								  cheapest_inner->pathkeys))
		{
			/* cheapest_inner is the winner */
			innersortkeys = NIL;	/* we do not need to sort it... */
		}
		else
		{
			/* look for a presorted path that's cheaper */
			List	   *trialsortkeys = listCopy(innersortkeys);
			Cost		cheapest_cost;
			int			clausecount;

			/*
			 * Cost to beat: cheapest_inner plus an explicit sort on the
			 * full inner sort key list.
			 */
			cheapest_cost = cheapest_inner->path_cost +
				cost_sort(innersortkeys, innerrel->size, innerrel->width);

			/*
			 * Try the full key list first, then successively shorter
			 * prefixes.  cheapest_cost is lowered on every win and the
			 * comparison is a strict '<', so a shorter prefix displaces an
			 * earlier winner only when it is strictly cheaper.
			 */
			for (clausecount = mergeclausecount;
				 clausecount > 0;
				 clausecount--)
			{
				Path	   *trialinnerpath;

				/* Look for an inner path ordered well enough to merge with
				 * the first 'clausecount' mergeclauses.  NB: trialsortkeys
				 * is modified destructively, which is why we made a copy...
				 *
				 * NOTE(review): the meaning of the trailing 'false' argument
				 * is defined by get_cheapest_path_for_pathkeys and is not
				 * visible here -- confirm against that function.
				 */
				trialinnerpath =
					get_cheapest_path_for_pathkeys(innerrel->pathlist,
												   ltruncate(clausecount,
															 trialsortkeys),
												   false);
				if (trialinnerpath != NULL &&
					trialinnerpath->path_cost < cheapest_cost)
				{
					/* Found a cheaper (or even-cheaper) sorted path */
					cheapest_cost = trialinnerpath->path_cost;
					mergeinnerpath = trialinnerpath;
					mergeclausecount = clausecount;
					innersortkeys = NIL;	/* we will not need to sort it... */
				}
			}
		}

		/*
		 * Finally, we can build the mergejoin path
		 *
		 * Only the first mergeclausecount clauses are kept as mergeclauses.
		 * The truncation is applied to the list returned by
		 * get_actual_clauses(), not to the RestrictInfo list itself.
		 * NOTE(review): presumably get_actual_clauses() returns a fresh
		 * list, so the destructive truncation is harmless -- confirm.
		 *
		 * The outer sort key argument is NIL (the outer path is used in its
		 * existing order); innersortkeys is non-NIL only when cheapest_inner
		 * must be explicitly sorted.
		 */
		mergeclauses = ltruncate(mergeclausecount,
								 get_actual_clauses(mergeclauses));
		path_list = lappend(path_list,
							create_mergejoin_path(joinrel,
												  outerrel->size,
												  innerrel->size,
												  outerrel->width,
												  innerrel->width,
												  outerpath,
												  mergeinnerpath,
												  merge_pathkeys,
												  mergeclauses,
												  NIL,
												  innersortkeys));
	}

	return path_list;
}

/*
 * match_unsorted_inner
 *	  Generate mergejoin paths that use an explicit sort of the outer path
 *	  with an already-ordered inner path.
 *
 * 'joinrel' is the join result relation
 * 'outerrel' is the outer join relation
 * 'innerrel' is the inner join relation
 * 'innerpath_list' is the list of possible inner join paths
 * 'mergeclause_list' is a list of RestrictInfo nodes for available
 *				mergejoin clauses between these two relations
 *
 * Returns a list of possible merge paths.
 */
static List *
match_unsorted_inner(RelOptInfo *joinrel,
					 RelOptInfo *outerrel,
					 RelOptInfo *innerrel,
					 List *innerpath_list,
					 List *mergeclause_list)
{
	List	   *path_list = NIL;
	List	   *i;

	foreach(i, innerpath_list)
	{
		Path	   *innerpath = (Path *) lfirst(i);
		List	   *mergeclauses;

		/* Look for useful mergeclauses (if any) */
		mergeclauses = find_mergeclauses_for_pathkeys(innerpath->pathkeys,
													  mergeclause_list);

		/*
		 * An inner path whose ordering matches no mergeclause contributes
		 * nothing here; at most one path is generated per inner path.
		 */
		if (mergeclauses)
		{
			List	   *outersortkeys;
			Path	   *mergeouterpath;
			List	   *merge_pathkeys;

			/* Compute the required ordering of the outer path */
			outersortkeys =
				make_pathkeys_for_mergeclauses(mergeclauses,
											   outerrel->targetlist);

			/* Look for an outer path already ordered well enough to merge */
			mergeouterpath =
				get_cheapest_path_for_pathkeys(outerrel->pathlist,
											   outersortkeys,
											   false);

			/*
			 * Should we use the mergeouter, or sort the cheapest outer?
			 *
			 * A tie goes to the presorted path: the test here is '<=',
			 * whereas the analogous test in match_unsorted_outer is a
			 * strict '<'.
			 */
			if (mergeouterpath != NULL &&
				mergeouterpath->path_cost <=
				(outerrel->cheapestpath->path_cost +
				 cost_sort(outersortkeys, outerrel->size, outerrel->width)))
			{
				/* Use mergeouterpath */
				outersortkeys = NIL;	/* no explicit sort step */
			}
			else
			{
				/* Use outerrel->cheapestpath, with the outersortkeys */
				mergeouterpath = outerrel->cheapestpath;
			}

			/*
			 * Compute pathkeys the result will have
			 *
			 * With an explicit sort the outer ordering is given by the sort
			 * keys; otherwise it is the presorted outer path's own pathkeys.
			 */
			merge_pathkeys = build_join_pathkeys(
				outersortkeys ? outersortkeys : mergeouterpath->pathkeys,
				joinrel->targetlist,
				mergeclauses);

			/*
			 * The inner sort key argument is NIL: innerpath is used in its
			 * existing order.
			 */
			mergeclauses = get_actual_clauses(mergeclauses);
			path_list = lappend(path_list,
								create_mergejoin_path(joinrel,
													  outerrel->size,
													  innerrel->size,
													  outerrel->width,
													  innerrel->width,
													  mergeouterpath,
													  innerpath,
													  merge_pathkeys,
													  mergeclauses,
													  outersortkeys,
													  NIL));
		}
	}

	return path_list;
}

/*
 * hash_inner_and_outer
 *	  Create hashjoin join paths by explicitly hashing both the outer and
 *	  inner join relations of each available hash clause.
 *
 * 'joinrel' is the join relation
 * 'outerrel' is the outer join relation
 * 'innerrel' is the inner join relation
 *
 * Returns a list of hashjoin paths.
*/ static List * hash_inner_and_outer(Query *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel) { List *hpath_list = NIL; List *i; foreach(i, joinrel->restrictinfo) { RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(i); /* we consider only clauses previously marked hashjoinable */ if (restrictinfo->hashjoinoperator) { Expr *clause = restrictinfo->clause; Var *leftop = get_leftop(clause); Var *rightop = get_rightop(clause); Var *innerop; Cost innerdisbursion; HashPath *hash_path; /* find the inner var and estimate its disbursion */ if (intMember(leftop->varno, innerrel->relids)) innerop = leftop; else innerop = rightop; innerdisbursion = estimate_disbursion(root, innerop); hash_path = create_hashjoin_path(joinrel, outerrel->size, innerrel->size, outerrel->width, innerrel->width, (Path *) outerrel->cheapestpath, (Path *) innerrel->cheapestpath, lcons(clause, NIL), innerdisbursion); hpath_list = lappend(hpath_list, hash_path); } } return hpath_list; } /* * Estimate disbursion of the specified Var * * We use a default of 0.1 if we can't figure out anything better. * This will typically discourage use of a hash rather strongly, * if the inner relation is large. We do not want to hash unless * we know that the inner rel is well-dispersed (or the alternatives * seem much worse). */ static Cost estimate_disbursion(Query *root, Var *var) { Oid relid; if (! IsA(var, Var)) return 0.1; relid = getrelid(var->varno, root->rtable); return (Cost) get_attdisbursion(relid, var->varattno, 0.1); } /* * select_mergejoin_clauses * Select mergejoin clauses that are usable for a particular join. * Returns a list of RestrictInfo nodes for those clauses. * * Currently, all we need is the restrictinfo list of the joinrel. 
* By definition, any mergejoinable clause in that list will work --- * it must involve only vars in the join, or it wouldn't have been * in the restrict list, and it must involve vars on both sides of * the join, or it wouldn't have made it up to this level of join. * Since we currently allow only simple Vars as the left and right * sides of mergejoin clauses, that means the mergejoin clauses must * be usable for this join. If we ever allow more complex expressions * containing multiple Vars, we would need to check that each side * of a potential joinclause uses only vars from one side of the join. */ static List * select_mergejoin_clauses(List *restrictinfo_list) { List *result_list = NIL; List *i; foreach(i, restrictinfo_list) { RestrictInfo *restrictinfo = lfirst(i); if (restrictinfo->mergejoinoperator != InvalidOid) result_list = lcons(restrictinfo, result_list); } return result_list; }