Implement "join removal" for cases where the inner side of a left join

is unique and is not referenced above the join.  In this case the inner
side doesn't affect the query result and can be thrown away entirely.
Although perhaps nobody would ever write such a thing by hand, it's
a reasonably common case in machine-generated SQL.

The current implementation only recognizes the case where the inner side
is a simple relation with a unique index matching the query conditions.
This is enough for the use-cases that have been shown so far, but we
might want to try to handle other cases later.

Robert Haas, somewhat rewritten by Tom
This commit is contained in:
Tom Lane 2009-09-17 20:49:29 +00:00
parent e3f027115a
commit 488d70ab46
11 changed files with 349 additions and 11 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.363 2009/07/30 02:45:37 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.364 2009/09/17 20:49:28 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
@ -1421,6 +1421,16 @@ _outUniquePath(StringInfo str, UniquePath *node)
WRITE_FLOAT_FIELD(rows, "%.0f");
}
static void
_outNoOpPath(StringInfo str, NoOpPath *node)
{
WRITE_NODE_TYPE("NOOPPATH");
_outPathInfo(str, (Path *) node);
WRITE_NODE_FIELD(subpath);
}
static void
_outNestPath(StringInfo str, NestPath *node)
{
@ -2634,6 +2644,9 @@ _outNode(StringInfo str, void *obj)
case T_UniquePath:
_outUniquePath(str, obj);
break;
case T_NoOpPath:
_outNoOpPath(str, obj);
break;
case T_NestPath:
_outNestPath(str, obj);
break;

View File

@ -1,4 +1,4 @@
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.50 2009/07/21 02:02:44 tgl Exp $
$PostgreSQL: pgsql/src/backend/optimizer/README,v 1.51 2009/09/17 20:49:28 tgl Exp $
Optimizer
=========
@ -354,6 +354,7 @@ RelOptInfo - a relation or joined relations
NestPath - nested-loop joins
MergePath - merge joins
HashPath - hash joins
NoOpPath - same as its input path (used when a join is removed)
EquivalenceClass - a data structure representing a set of values known equal

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.185 2009/09/02 17:52:24 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.186 2009/09/17 20:49:28 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -1387,6 +1387,10 @@ print_path(PlannerInfo *root, Path *path, int indent)
ptype = "Unique";
subpath = ((UniquePath *) path)->subpath;
break;
case T_NoOpPath:
ptype = "NoOp";
subpath = ((NoOpPath *) path)->subpath;
break;
case T_NestPath:
ptype = "NestLoop";
join = true;

View File

@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.241 2009/08/04 16:08:36 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.242 2009/09/17 20:49:28 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -1918,6 +1918,86 @@ find_clauses_for_join(PlannerInfo *root, RelOptInfo *rel,
return clause_list;
}
/*
* relation_has_unique_index_for
* Determine whether the relation provably has at most one row satisfying
* a set of equality conditions, because the conditions constrain all
* columns of some unique index.
*
* The conditions are provided as a list of RestrictInfo nodes, where the
* caller has already determined that each condition is a mergejoinable
* equality with an expression in this relation on one side, and an
* expression not involving this relation on the other. The transient
* outer_is_left flag is used to identify which side we should look at:
* left side if outer_is_left is false, right side if it is true.
*/
bool
relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
List *restrictlist)
{
ListCell *ic;
/* Short-circuit the easy case */
if (restrictlist == NIL)
return false;
/* Examine each index of the relation ... */
foreach(ic, rel->indexlist)
{
IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
int c;
/*
* If the index is not unique or if it's a partial index that doesn't
* match the query, it's useless here.
*/
if (!ind->unique || (ind->indpred != NIL && !ind->predOK))
continue;
/*
* Try to find each index column in the list of conditions. This is
* O(n^2) or worse, but we expect all the lists to be short.
*/
for (c = 0; c < ind->ncolumns; c++)
{
ListCell *lc;
foreach(lc, restrictlist)
{
RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
Node *rexpr;
/*
* The condition's equality operator must be a member of the
* index opfamily, else it is not asserting the right kind
* of equality behavior for this index. We check this first
* since it's probably cheaper than match_index_to_operand().
*/
if (!list_member_oid(rinfo->mergeopfamilies, ind->opfamily[c]))
continue;
/* OK, see if the condition operand matches the index key */
if (rinfo->outer_is_left)
rexpr = get_rightop(rinfo->clause);
else
rexpr = get_leftop(rinfo->clause);
if (match_index_to_operand(rexpr, c, ind))
break; /* found a match; column is unique */
}
if (lc == NULL)
break; /* no match; this index doesn't help us */
}
/* Matched all columns of this index? */
if (c == ind->ncolumns)
return true;
}
return false;
}
/****************************************************************************
* ---- PATH CREATION UTILITIES ----

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.123 2009/09/12 22:12:04 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.124 2009/09/17 20:49:28 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -22,6 +22,11 @@
#include "optimizer/paths.h"
static bool join_is_removable(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, JoinType jointype);
static void generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel);
static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list,
@ -78,11 +83,26 @@ add_paths_to_joinrel(PlannerInfo *root,
{
List *mergeclause_list = NIL;
/*
* 0. Consider join removal. This is always the most efficient strategy,
* so if it works, there's no need to consider anything further.
*/
if (join_is_removable(root, joinrel, outerrel, innerrel,
restrictlist, jointype))
{
generate_outer_only(root, joinrel, outerrel);
return;
}
/*
* Find potential mergejoin clauses. We can skip this if we are not
* interested in doing a mergejoin. However, mergejoin is currently our
* only way of implementing full outer joins, so override mergejoin
* disable if it's a full join.
*
* Note: do this after join_is_removable(), because this sets the
* outer_is_left flags in the mergejoin clauses, while join_is_removable
* uses those flags for its own purposes.
*/
if (enable_mergejoin || jointype == JOIN_FULL)
mergeclause_list = select_mergejoin_clauses(root,
@ -133,6 +153,180 @@ add_paths_to_joinrel(PlannerInfo *root,
restrictlist, jointype, sjinfo);
}
/*
* join_is_removable
* Determine whether we need not perform the join at all, because
* it will just duplicate its left input.
*
* This is true for a left join for which the join condition cannot match
* more than one inner-side row. (There are other possibly interesting
* cases, but we don't have the infrastructure to prove them.)
*
* Note: there is no need to consider the symmetrical case of duplicating the
* right input, because add_paths_to_joinrel() will be called with each rel
* on the outer side.
*/
static bool
join_is_removable(PlannerInfo *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
JoinType jointype)
{
List *clause_list = NIL;
ListCell *l;
int attroff;
/*
* Currently, we only know how to remove left joins to a baserel with
* unique indexes. We can check most of these criteria pretty trivially
* to avoid doing useless extra work. But checking whether any of the
* indexes are unique would require iterating over the indexlist, so for
* now we just make sure there are indexes of some sort or other. If none
* of them are unique, join removal will still fail, just slightly later.
*/
if (jointype != JOIN_LEFT ||
innerrel->reloptkind == RELOPT_JOINREL ||
innerrel->rtekind != RTE_RELATION ||
innerrel->indexlist == NIL)
return false;
/*
* We can't remove the join if any inner-rel attributes are used above
* the join.
*
* As a micro-optimization, it seems better to start with max_attr and
* count down rather than starting with min_attr and counting up, on the
* theory that the system attributes are somewhat less likely to be wanted
* and should be tested last.
*/
for (attroff = innerrel->max_attr - innerrel->min_attr;
attroff >= 0;
attroff--)
{
if (!bms_is_subset(innerrel->attr_needed[attroff], joinrel->relids))
return false;
}
/*
* Search for mergejoinable clauses that constrain the inner rel against
* either the outer rel or a pseudoconstant. If an operator is
* mergejoinable then it behaves like equality for some btree opclass,
* so it's what we want. The mergejoinability test also eliminates
* clauses containing volatile functions, which we couldn't depend on.
*/
foreach(l, restrictlist)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
/*
* We are always considering an outer join here, so ignore pushed-down
* clauses. Also ignore anything that doesn't have a mergejoinable
* operator.
*/
if (restrictinfo->is_pushed_down)
continue;
if (!restrictinfo->can_join ||
restrictinfo->mergeopfamilies == NIL)
continue; /* not mergejoinable */
/*
* Check if clause is usable with these input rels. All the vars
* needed on each side of the clause must be available from one or the
* other of the input rels.
*/
if (bms_is_subset(restrictinfo->left_relids, outerrel->relids) &&
bms_is_subset(restrictinfo->right_relids, innerrel->relids))
{
/* righthand side is inner */
restrictinfo->outer_is_left = true;
}
else if (bms_is_subset(restrictinfo->left_relids, innerrel->relids) &&
bms_is_subset(restrictinfo->right_relids, outerrel->relids))
{
/* lefthand side is inner */
restrictinfo->outer_is_left = false;
}
else
continue; /* no good for these input relations */
/* OK, add to list */
clause_list = lappend(clause_list, restrictinfo);
}
/* Now examine the rel's restriction clauses for var = const clauses */
foreach(l, innerrel->baserestrictinfo)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
/*
* Note: can_join won't be set for a restriction clause, but
* mergeopfamilies will be if it has a mergejoinable operator
* and doesn't contain volatile functions.
*/
if (restrictinfo->mergeopfamilies == NIL)
continue; /* not mergejoinable */
/*
* The clause certainly doesn't refer to anything but the given
* rel. If either side is pseudoconstant then we can use it.
*/
if (bms_is_empty(restrictinfo->left_relids))
{
/* righthand side is inner */
restrictinfo->outer_is_left = true;
}
else if (bms_is_empty(restrictinfo->right_relids))
{
/* lefthand side is inner */
restrictinfo->outer_is_left = false;
}
else
continue;
/* OK, add to list */
clause_list = lappend(clause_list, restrictinfo);
}
/* Now examine the indexes to see if we have a matching unique index */
if (relation_has_unique_index_for(root, innerrel, clause_list))
return true;
/*
* Some day it would be nice to check for other methods of establishing
* distinctness.
*/
return false;
}
/*
* generate_outer_only
* Generate "join" paths when we have found the join is removable.
*/
static void
generate_outer_only(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel)
{
ListCell *lc;
/*
* For the moment, replicate all of the outerrel's paths as join paths.
* Some of them might not really be interesting above the join, if they
* have sort orderings that have no real use except to do a mergejoin
* for the join we've just found we don't need. But distinguishing that
* case probably isn't worth the extra code it would take.
*/
foreach(lc, outerrel->pathlist)
{
Path *outerpath = (Path *) lfirst(lc);
add_path(joinrel, (Path *)
create_noop_path(root, joinrel, outerpath));
}
}
/*
* sort_inner_and_outer
* Create mergejoin join paths by explicitly sorting both the outer and

View File

@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.262 2009/09/12 22:12:04 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.263 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -164,6 +164,11 @@ create_plan(PlannerInfo *root, Path *best_path)
case T_WorkTableScan:
plan = create_scan_plan(root, best_path);
break;
case T_Join:
/* this is only used for no-op joins */
Assert(IsA(best_path, NoOpPath));
plan = create_plan(root, ((NoOpPath *) best_path)->subpath);
break;
case T_HashJoin:
case T_MergeJoin:
case T_NestLoop:

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.153 2009/09/12 22:12:04 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.154 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -1215,6 +1215,26 @@ distinct_col_search(int colno, List *colnos, List *opids)
return InvalidOid;
}
/*
* create_noop_path
* Creates a path equivalent to the input subpath, but having a different
* parent rel. This is used when a join is found to be removable.
*/
NoOpPath *
create_noop_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
{
NoOpPath *pathnode = makeNode(NoOpPath);
pathnode->path.pathtype = T_Join; /* by convention */
pathnode->path.parent = rel;
pathnode->path.startup_cost = subpath->startup_cost;
pathnode->path.total_cost = subpath->total_cost;
pathnode->path.pathkeys = subpath->pathkeys;
pathnode->subpath = subpath;
return pathnode;
}
/*
* create_subqueryscan_path
* Creates a path corresponding to a sequential scan of a subquery,

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.224 2009/07/30 02:45:37 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.225 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -211,6 +211,7 @@ typedef enum NodeTag
T_ResultPath,
T_MaterialPath,
T_UniquePath,
T_NoOpPath,
T_EquivalenceClass,
T_EquivalenceMember,
T_PathKey,

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.174 2009/07/16 20:55:44 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.175 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -783,6 +783,22 @@ typedef struct UniquePath
double rows; /* estimated number of result tuples */
} UniquePath;
/*
* NoOpPath represents exactly the same plan as its subpath. This is used
* when we have determined that a join can be eliminated. The difference
* between the NoOpPath and its subpath is just that the NoOpPath's parent
* is the whole join relation while the subpath is for one of the joined
* relations (and the other one isn't needed).
*
* Note: path.pathtype is always T_Join, but this won't actually give rise
* to a Join plan node.
*/
typedef struct NoOpPath
{
Path path;
Path *subpath;
} NoOpPath;
/*
* All join-type paths share these fields.
*/

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.80 2009/01/01 17:24:00 momjian Exp $
* $PostgreSQL: pgsql/src/include/optimizer/pathnode.h,v 1.81 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -51,6 +51,8 @@ extern ResultPath *create_result_path(List *quals);
extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
Path *subpath, SpecialJoinInfo *sjinfo);
extern NoOpPath *create_noop_path(PlannerInfo *root, RelOptInfo *rel,
Path *subpath);
extern Path *create_subqueryscan_path(RelOptInfo *rel, List *pathkeys);
extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel);
extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.107 2009/06/11 14:49:11 momjian Exp $
* $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.108 2009/09/17 20:49:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -57,6 +57,8 @@ extern List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
extern void best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel,
RelOptInfo *outer_rel, JoinType jointype,
Path **cheapest_startup, Path **cheapest_total);
extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
List *restrictlist);
extern List *group_clauses_by_indexkey(IndexOptInfo *index,
List *clauses, List *outer_clauses,
Relids outer_relids,