postgresql/src/backend/optimizer/path/joinpath.c
Tom Lane b1577a7c78 New cost model for planning, incorporating a penalty for random page
accesses versus sequential accesses, a (very crude) estimate of the
effects of caching on random page accesses, and cost to evaluate WHERE-
clause expressions.  Export critical parameters for this model as SET
variables.  Also, create SET variables for the planner's enable flags
(enable_seqscan, enable_indexscan, etc) so that these can be controlled
more conveniently than via PGOPTIONS.

Planner now estimates both startup cost (cost before retrieving
first tuple) and total cost of each path, so it can optimize queries
with LIMIT on a reasonable basis by interpolating between these costs.
Same facility is a win for EXISTS(...) subqueries and some other cases.

Redesign pathkey representation to achieve a major speedup in planning
(I saw as much as 5X on a 10-way join); also minor changes in planner
to reduce memory consumption by recycling discarded Path nodes and
not constructing unnecessary lists.

Minor cleanups to display more-plausible costs in some cases in
EXPLAIN output.

Initdb forced by change in interface to index cost estimation
functions.
2000-02-15 20:49:31 +00:00

713 lines
23 KiB
C

/*-------------------------------------------------------------------------
*
* joinpath.c
* Routines to find all possible paths for processing a set of joins
*
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.52 2000/02/15 20:49:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/types.h>
#include <math.h>
#include "postgres.h"
#include "access/htup.h"
#include "catalog/pg_attribute.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/restrictinfo.h"
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
static void sort_inner_and_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
static void match_unsorted_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
#ifdef NOT_USED
static void match_unsorted_inner(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
#endif
static void hash_inner_and_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist);
static Path *best_innerjoin(List *join_paths, List *outer_relid);
static Selectivity estimate_disbursion(Query *root, Var *var);
static List *select_mergejoin_clauses(RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist);
/*
* add_paths_to_joinrel
* Given a join relation and two component rels from which it can be made,
* consider all possible paths that use the two component rels as outer
* and inner rel respectively. Add these paths to the join rel's pathlist
* if they survive comparison with other paths (and remove any existing
* paths that are dominated by these paths).
*
* Modifies the pathlist field of the joinrel node to contain the best
* paths found so far.
*/
void
add_paths_to_joinrel(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist)
{
List *mergeclause_list = NIL;
/*
* Find potential mergejoin clauses.
*/
if (enable_mergejoin)
mergeclause_list = select_mergejoin_clauses(joinrel,
outerrel,
innerrel,
restrictlist);
/*
* 1. Consider mergejoin paths where both relations must be
* explicitly sorted.
*/
sort_inner_and_outer(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
/*
* 2. Consider paths where the outer relation need not be
* explicitly sorted. This includes both nestloops and
* mergejoins where the outer path is already ordered.
*/
match_unsorted_outer(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
#ifdef NOT_USED
/*
* 3. Consider paths where the inner relation need not be
* explicitly sorted. This includes mergejoins only
* (nestloops were already built in match_unsorted_outer).
*
* Diked out as redundant 2/13/2000 -- tgl. There isn't any
* really significant difference between the inner and outer
* side of a mergejoin, so match_unsorted_inner creates no paths
* that aren't equivalent to those made by match_unsorted_outer
* when add_paths_to_joinrel() is invoked with the two rels given
* in the other order.
*/
match_unsorted_inner(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
#endif
/*
* 4. Consider paths where both outer and inner relations must be
* hashed before being joined.
*/
if (enable_hashjoin)
hash_inner_and_outer(root, joinrel, outerrel, innerrel,
restrictlist);
}
/*
* sort_inner_and_outer
* Create mergejoin join paths by explicitly sorting both the outer and
* inner join relations on each available merge ordering.
*
* 'joinrel' is the join relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*/
static void
sort_inner_and_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *mergeclause_list)
{
List *i;
/*
* Each possible ordering of the available mergejoin clauses will
* generate a differently-sorted result path at essentially the
* same cost. We have no basis for choosing one over another at
* this level of joining, but some sort orders may be more useful
* than others for higher-level mergejoins. Generating a path here
* for *every* permutation of mergejoin clauses doesn't seem like
* a winning strategy, however; the cost in planning time is too high.
*
* For now, we generate one path for each mergejoin clause, listing that
* clause first and the rest in random order. This should allow at least
* a one-clause mergejoin without re-sorting against any other possible
* mergejoin partner path. But if we've not guessed the right ordering
* of secondary clauses, we may end up evaluating clauses as qpquals when
* they could have been done as mergeclauses. We need to figure out a
* better way. (Two possible approaches: look at all the relevant index
* relations to suggest plausible sort orders, or make just one output
* path and somehow mark it as having a sort-order that can be rearranged
* freely.)
*/
foreach(i, mergeclause_list)
{
RestrictInfo *restrictinfo = lfirst(i);
List *curclause_list;
List *outerkeys;
List *innerkeys;
List *merge_pathkeys;
/* Make a mergeclause list with this guy first. */
curclause_list = lcons(restrictinfo,
lremove(restrictinfo,
listCopy(mergeclause_list)));
/* Build sort pathkeys for both sides.
*
* Note: it's possible that the cheapest paths will already be
* sorted properly. create_mergejoin_path will detect that case
* and suppress an explicit sort step, so we needn't do so here.
*/
outerkeys = make_pathkeys_for_mergeclauses(root,
curclause_list,
outerrel->targetlist);
innerkeys = make_pathkeys_for_mergeclauses(root,
curclause_list,
innerrel->targetlist);
/* Build pathkeys representing output sort order. */
merge_pathkeys = build_join_pathkeys(outerkeys,
joinrel->targetlist,
root->equi_key_list);
/*
* And now we can make the path. We only consider the cheapest-
* total-cost input paths, since we are assuming here that a sort
* is required. We will consider cheapest-startup-cost input paths
* later, and only if they don't need a sort.
*/
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerrel->cheapest_total_path,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys,
get_actual_clauses(curclause_list),
outerkeys,
innerkeys));
}
}
/*
* match_unsorted_outer
* Creates possible join paths for processing a single join relation
* 'joinrel' by employing either iterative substitution or
* mergejoining on each of its possible outer paths (considering
* only outer paths that are already ordered well enough for merging).
*
* We always generate a nestloop path for each available outer path.
* In fact we may generate as many as three: one on the cheapest-total-cost
* inner path, one on the cheapest-startup-cost inner path (if different),
* and one on the best inner-indexscan path (if any).
*
* We also consider mergejoins if mergejoin clauses are available. We have
* two ways to generate the inner path for a mergejoin: sort the cheapest
* inner path, or use an inner path that is already suitably ordered for the
* merge. If we have several mergeclauses, it could be that there is no inner
* path (or only a very expensive one) for the full list of mergeclauses, but
* better paths exist if we truncate the mergeclause list (thereby discarding
* some sort key requirements). So, we consider truncations of the
* mergeclause list as well as the full list. (Ideally we'd consider all
* subsets of the mergeclause list, but that seems way too expensive.)
*
* 'joinrel' is the join relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*/
static void
match_unsorted_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *mergeclause_list)
{
Path *bestinnerjoin;
List *i;
/*
* Get the best innerjoin indexpath (if any) for this outer rel.
* It's the same for all outer paths.
*/
bestinnerjoin = best_innerjoin(innerrel->innerjoin, outerrel->relids);
foreach(i, outerrel->pathlist)
{
Path *outerpath = (Path *) lfirst(i);
List *merge_pathkeys;
List *mergeclauses;
List *innersortkeys;
List *trialsortkeys;
Path *cheapest_startup_inner;
Path *cheapest_total_inner;
int clausecnt;
/*
* The result will have this sort order (even if it is implemented
* as a nestloop, and even if some of the mergeclauses are implemented
* by qpquals rather than as true mergeclauses):
*/
merge_pathkeys = build_join_pathkeys(outerpath->pathkeys,
joinrel->targetlist,
root->equi_key_list);
/*
* Always consider a nestloop join with this outer and cheapest-
* total-cost inner. Consider nestloops using the cheapest-
* startup-cost inner as well, and the best innerjoin indexpath.
*/
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys));
if (innerrel->cheapest_startup_path != innerrel->cheapest_total_path)
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
innerrel->cheapest_startup_path,
restrictlist,
merge_pathkeys));
if (bestinnerjoin != NULL)
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
bestinnerjoin,
restrictlist,
merge_pathkeys));
/* Look for useful mergeclauses (if any) */
mergeclauses = find_mergeclauses_for_pathkeys(outerpath->pathkeys,
mergeclause_list);
/* Done with this outer path if no chance for a mergejoin */
if (mergeclauses == NIL)
continue;
/* Compute the required ordering of the inner path */
innersortkeys = make_pathkeys_for_mergeclauses(root,
mergeclauses,
innerrel->targetlist);
/*
* Generate a mergejoin on the basis of sorting the cheapest inner.
* Since a sort will be needed, only cheapest total cost matters.
*/
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
innersortkeys));
/*
* Look for presorted inner paths that satisfy the mergeclause list
* or any truncation thereof. Here, we consider both cheap startup
* cost and cheap total cost.
*/
trialsortkeys = listCopy(innersortkeys); /* modifiable copy */
cheapest_startup_inner = NULL;
cheapest_total_inner = NULL;
for (clausecnt = length(mergeclauses); clausecnt > 0; clausecnt--)
{
Path *innerpath;
/* Look for an inner path ordered well enough to merge with
* the first 'clausecnt' mergeclauses. NB: trialsortkeys list
* is modified destructively, which is why we made a copy...
*/
trialsortkeys = ltruncate(clausecnt, trialsortkeys);
innerpath = get_cheapest_path_for_pathkeys(innerrel->pathlist,
trialsortkeys,
TOTAL_COST);
if (innerpath != NULL &&
(cheapest_total_inner == NULL ||
compare_path_costs(innerpath, cheapest_total_inner,
TOTAL_COST) < 0))
{
/* Found a cheap (or even-cheaper) sorted path */
List *newclauses;
newclauses = ltruncate(clausecnt,
get_actual_clauses(mergeclauses));
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerpath,
restrictlist,
merge_pathkeys,
newclauses,
NIL,
NIL));
cheapest_total_inner = innerpath;
}
/* Same on the basis of cheapest startup cost ... */
innerpath = get_cheapest_path_for_pathkeys(innerrel->pathlist,
trialsortkeys,
STARTUP_COST);
if (innerpath != NULL &&
(cheapest_startup_inner == NULL ||
compare_path_costs(innerpath, cheapest_startup_inner,
STARTUP_COST) < 0))
{
/* Found a cheap (or even-cheaper) sorted path */
if (innerpath != cheapest_total_inner)
{
List *newclauses;
newclauses = ltruncate(clausecnt,
get_actual_clauses(mergeclauses));
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerpath,
restrictlist,
merge_pathkeys,
newclauses,
NIL,
NIL));
}
cheapest_startup_inner = innerpath;
}
}
}
}
#ifdef NOT_USED
/*
* match_unsorted_inner
* Generate mergejoin paths that use an explicit sort of the outer path
* with an already-ordered inner path.
*
* 'joinrel' is the join result relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*/
static void
match_unsorted_inner(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *mergeclause_list)
{
List *i;
foreach(i, innerrel->pathlist)
{
Path *innerpath = (Path *) lfirst(i);
List *mergeclauses;
List *outersortkeys;
List *merge_pathkeys;
Path *totalouterpath;
Path *startupouterpath;
/* Look for useful mergeclauses (if any) */
mergeclauses = find_mergeclauses_for_pathkeys(innerpath->pathkeys,
mergeclause_list);
if (mergeclauses == NIL)
continue;
/* Compute the required ordering of the outer path */
outersortkeys = make_pathkeys_for_mergeclauses(root,
mergeclauses,
outerrel->targetlist);
/*
* Generate a mergejoin on the basis of sorting the cheapest outer.
* Since a sort will be needed, only cheapest total cost matters.
*/
merge_pathkeys = build_join_pathkeys(outersortkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerrel->cheapest_total_path,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
outersortkeys,
NIL));
/*
* Now generate mergejoins based on already-sufficiently-ordered
* outer paths. There's likely to be some redundancy here with paths
* already generated by merge_unsorted_outer ... but since
* merge_unsorted_outer doesn't consider all permutations of the
* mergeclause list, it may fail to notice that this particular
* innerpath could have been used with this outerpath.
*/
totalouterpath = get_cheapest_path_for_pathkeys(outerrel->pathlist,
outersortkeys,
TOTAL_COST);
if (totalouterpath == NULL)
continue; /* there won't be a startup-cost path either */
merge_pathkeys = build_join_pathkeys(totalouterpath->pathkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
totalouterpath,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
NIL));
startupouterpath = get_cheapest_path_for_pathkeys(outerrel->pathlist,
outersortkeys,
STARTUP_COST);
if (startupouterpath != NULL && startupouterpath != totalouterpath)
{
merge_pathkeys = build_join_pathkeys(startupouterpath->pathkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
startupouterpath,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
NIL));
}
}
}
#endif
/*
* hash_inner_and_outer
* Create hashjoin join paths by explicitly hashing both the outer and
* inner join relations of each available hash clause.
*
* 'joinrel' is the join relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
*/
static void
hash_inner_and_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist)
{
Relids outerrelids = outerrel->relids;
Relids innerrelids = innerrel->relids;
List *i;
/*
* Scan the join's restrictinfo list to find hashjoinable clauses
* that are usable with this pair of sub-relations. Since we currently
* accept only var-op-var clauses as hashjoinable, we need only check
* the membership of the vars to determine whether a particular clause
* can be used with this pair of sub-relations. This code would need
* to be upgraded if we wanted to allow more-complex expressions in
* hash joins.
*/
foreach(i, restrictlist)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(i);
Expr *clause;
Var *left,
*right,
*inner;
Selectivity innerdisbursion;
if (restrictinfo->hashjoinoperator == InvalidOid)
continue; /* not hashjoinable */
clause = restrictinfo->clause;
/* these must be OK, since check_hashjoinable accepted the clause */
left = get_leftop(clause);
right = get_rightop(clause);
/* check if clause is usable with these sub-rels, find inner var */
if (intMember(left->varno, outerrelids) &&
intMember(right->varno, innerrelids))
inner = right;
else if (intMember(left->varno, innerrelids) &&
intMember(right->varno, outerrelids))
inner = left;
else
continue; /* no good for these input relations */
/* estimate disbursion of inner var for costing purposes */
innerdisbursion = estimate_disbursion(root, inner);
/*
* We consider both the cheapest-total-cost and cheapest-startup-cost
* outer paths. There's no need to consider any but the cheapest-
* total-cost inner path, however.
*/
add_path(joinrel, (Path *)
create_hashjoin_path(joinrel,
outerrel->cheapest_total_path,
innerrel->cheapest_total_path,
restrictlist,
lcons(clause, NIL),
innerdisbursion));
if (outerrel->cheapest_startup_path != outerrel->cheapest_total_path)
add_path(joinrel, (Path *)
create_hashjoin_path(joinrel,
outerrel->cheapest_startup_path,
innerrel->cheapest_total_path,
restrictlist,
lcons(clause, NIL),
innerdisbursion));
}
}
/*
* best_innerjoin
* Find the cheapest index path that has already been identified by
* indexable_joinclauses() as being a possible inner path for the given
* outer relation(s) in a nestloop join.
*
* We compare indexpaths on total_cost only, assuming that they will all have
* zero or negligible startup_cost. We might have to think harder someday...
*
* 'join_paths' is a list of potential inner indexscan join paths
* 'outer_relids' is the relid list of the outer join relation
*
* Returns the pathnode of the best path, or NULL if there's no
* usable path.
*/
static Path *
best_innerjoin(List *join_paths, Relids outer_relids)
{
Path *cheapest = (Path *) NULL;
List *join_path;
foreach(join_path, join_paths)
{
Path *path = (Path *) lfirst(join_path);
Assert(IsA(path, IndexPath));
/* path->joinrelids is the set of base rels that must be part of
* outer_relids in order to use this inner path, because those
* rels are used in the index join quals of this inner path.
*/
if (is_subseti(((IndexPath *) path)->joinrelids, outer_relids) &&
(cheapest == NULL ||
compare_path_costs(path, cheapest, TOTAL_COST) < 0))
cheapest = path;
}
return cheapest;
}
/*
* Estimate disbursion of the specified Var
*
* We use a default of 0.1 if we can't figure out anything better.
* This will typically discourage use of a hash rather strongly,
* if the inner relation is large. We do not want to hash unless
* we know that the inner rel is well-dispersed (or the alternatives
* seem much worse).
*/
static Selectivity
estimate_disbursion(Query *root, Var *var)
{
Oid relid;
if (! IsA(var, Var))
return 0.1;
relid = getrelid(var->varno, root->rtable);
return (Selectivity) get_attdisbursion(relid, var->varattno, 0.1);
}
/*
* select_mergejoin_clauses
* Select mergejoin clauses that are usable for a particular join.
* Returns a list of RestrictInfo nodes for those clauses.
*
* We examine each restrictinfo clause known for the join to see
* if it is mergejoinable and involves vars from the two sub-relations
* currently of interest.
*
* Since we currently allow only plain Vars as the left and right sides
* of mergejoin clauses, this test is relatively simple. This routine
* would need to be upgraded to support more-complex expressions
* as sides of mergejoins. In theory, we could allow arbitrarily complex
* expressions in mergejoins, so long as one side uses only vars from one
* sub-relation and the other side uses only vars from the other.
*/
static List *
select_mergejoin_clauses(RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist)
{
List *result_list = NIL;
Relids outerrelids = outerrel->relids;
Relids innerrelids = innerrel->relids;
List *i;
foreach(i, restrictlist)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(i);
Expr *clause;
Var *left,
*right;
if (restrictinfo->mergejoinoperator == InvalidOid)
continue; /* not mergejoinable */
clause = restrictinfo->clause;
/* these must be OK, since check_mergejoinable accepted the clause */
left = get_leftop(clause);
right = get_rightop(clause);
if ((intMember(left->varno, outerrelids) &&
intMember(right->varno, innerrelids)) ||
(intMember(left->varno, innerrelids) &&
intMember(right->varno, outerrelids)))
result_list = lcons(restrictinfo, result_list);
}
return result_list;
}