postgresql/src/backend/optimizer/path/orindxpath.c

386 lines
13 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
 *
 * orindxpath.c
 *	  Routines to find index paths that match a set of OR clauses
 *
 * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.61 2004/08/29 04:12:33 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/restrictinfo.h"
/*
 * Forward declarations of the file-local helpers defined later in this file.
 */

/*
 * Builds one multi-scan IndexPath covering every subclause of an OR clause.
 * Returns NULL as soon as some subclause has no usable index.
 */
static IndexPath *best_or_subclause_indexes(Query *root, RelOptInfo *rel,
List *subclauses);
/*
 * Scans rel->indexlist for the index with the lowest total cost that matches
 * 'subclause'.  Returns true and fills the five ret* outputs on success;
 * returns false, leaving the outputs unset, when no index matches.
 */
static bool best_or_subclause_index(Query *root,
RelOptInfo *rel,
Expr *subclause,
IndexOptInfo **retIndexInfo,
List **retIndexClauses,
List **retIndexQuals,
Cost *retStartupCost,
Cost *retTotalCost);
/*----------
* create_or_index_quals
* Examine join OR-of-AND quals to see if any useful restriction OR
* clauses can be extracted. If so, add them to the query.
*
* Although a join clause must reference other relations overall,
* an OR of ANDs clause might contain sub-clauses that reference just this
* relation and can be used to build a restriction clause.
* For example consider
* WHERE ((a.x = 42 AND b.y = 43) OR (a.x = 44 AND b.z = 45));
* We can transform this into
* WHERE ((a.x = 42 AND b.y = 43) OR (a.x = 44 AND b.z = 45))
* AND (a.x = 42 OR a.x = 44)
* AND (b.y = 43 OR b.z = 45);
* which opens the potential to build OR indexscans on a and b. In essence
* this is a partial transformation to CNF (AND of ORs format). It is not
* complete, however, because we do not unravel the original OR --- doing so
* would usually bloat the qualification expression to little gain.
*
* The added quals are partially redundant with the original OR, and therefore
* will cause the size of the joinrel to be underestimated when it is finally
* formed. (This would be true of a full transformation to CNF as well; the
* fault is not really in the transformation, but in clauselist_selectivity's
* inability to recognize redundant conditions.) To minimize the collateral
* damage, we want to minimize the number of quals added. Therefore we do
* not add every possible extracted restriction condition to the query.
* Instead, we search for the single restriction condition that generates
* the most useful (cheapest) OR indexscan, and add only that condition.
* This is a pretty ad-hoc heuristic, but quite useful.
*
* We can then compensate for the redundancy of the added qual by poking
* the recorded selectivity of the original OR clause, thereby ensuring
* the added qual doesn't change the estimated size of the joinrel when
* it is finally formed. This is a MAJOR HACK: it depends on the fact
* that clause selectivities are cached and on the fact that the same
* RestrictInfo node will appear in every joininfo list that might be used
* when the joinrel is formed. And it probably isn't right in cases where
* the size estimation is nonlinear (i.e., outer and IN joins). But it
* beats not doing anything.
*
* NOTE: one might think this messiness could be worked around by generating
* the indexscan path with a small path->rows value, and not touching the
* rel's baserestrictinfo or rel->rows. However, that does not work.
* The optimizer's fundamental design assumes that every general-purpose
* Path for a given relation generates the same number of rows. Without
* this assumption we'd not be able to optimize solely on the cost of Paths,
* but would have to take number of output rows into account as well.
* (Perhaps someday that'd be worth doing, but it's a pretty big change...)
*
* 'rel' is the relation entry for which quals are to be created
*
* If successful, adds qual(s) to rel->baserestrictinfo and returns TRUE.
* If no quals available, returns FALSE and doesn't change rel.
*
* Note: check_partial_indexes() must have been run previously.
*----------
*/
bool
create_or_index_quals(Query *root, RelOptInfo *rel)
{
	IndexPath  *cheapest_path = NULL;	/* cheapest OR indexscan seen so far */
	RestrictInfo *source_rinfo = NULL;	/* join clause that produced it */
	RestrictInfo *extracted_rinfo;
	List	   *extracted_list;
	Selectivity extracted_selec;
	ListCell   *jlc;

	/*
	 * Walk every join clause attached to this rel and ask
	 * best_or_subclause_indexes() whether its OR arms can all be driven by
	 * indexes of this rel.  Join clauses not marked valid_everywhere are
	 * skipped: outer-join rules forbid pushing them down.
	 */
	foreach(jlc, rel->joininfo)
	{
		JoinInfo   *jinfo = (JoinInfo *) lfirst(jlc);
		ListCell   *rlc;

		foreach(rlc, jinfo->jinfo_restrictinfo)
		{
			RestrictInfo *candidate = (RestrictInfo *) lfirst(rlc);
			BoolExpr   *orexpr;
			IndexPath  *candidate_path;

			/* Only pushable OR clauses are of interest */
			if (!restriction_is_or_clause(candidate) ||
				!candidate->valid_everywhere)
				continue;

			orexpr = (BoolExpr *) candidate->orclause;
			candidate_path = best_or_subclause_indexes(root, rel,
													   orexpr->args);

			/* Some OR arm had no matching index: nothing to extract */
			if (candidate_path == NULL)
				continue;

			/* Remember the strictly cheapest candidate (first one wins ties) */
			if (cheapest_path == NULL ||
				candidate_path->path.total_cost < cheapest_path->path.total_cost)
			{
				cheapest_path = candidate_path;
				source_rinfo = candidate;
			}
		}
	}

	/* No join OR clause yielded a usable restriction */
	if (cheapest_path == NULL)
		return false;

	/*
	 * Turn the winning path's indexclauses into a RestrictInfo tree (expected
	 * to be exactly one OR clause) and append it to the rel's restrictions.
	 */
	extracted_list =
		make_restrictinfo_from_indexclauses(cheapest_path->indexclauses,
											true, true);
	Assert(list_length(extracted_list) == 1);
	extracted_rinfo = (RestrictInfo *) linitial(extracted_list);
	rel->baserestrictinfo = list_concat(rel->baserestrictinfo,
										extracted_list);

	/*
	 * The new qual is redundant with the join clause it came from.  Divide
	 * the join clause's cached selectivity by the new qual's selectivity so
	 * that the product of the two matches the original estimate when the
	 * joinrel is sized.  (XXX major hack: relies on the selectivity staying
	 * cached in the RestrictInfo.)
	 */
	extracted_selec = clause_selectivity(root, (Node *) extracted_rinfo,
										 0, JOIN_INNER);
	if (extracted_selec > 0 && extracted_selec < 1)
	{
		Selectivity join_selec = clause_selectivity(root,
													(Node *) source_rinfo,
													0, JOIN_INNER);
		Selectivity adjusted = join_selec / extracted_selec;

		/* clamp result to sane range */
		source_rinfo->this_selec = (adjusted > 1) ? 1.0 : adjusted;
	}

	/* Tell caller to recompute rel's rows estimate */
	return true;
}
/*
 * create_or_index_paths
 *	  Build a multi-scan index path for each OR clause found among the
 *	  restriction clauses of 'rel', whenever every arm of the OR can be
 *	  matched to some index.
 *
 * 'root' is passed through to best_or_subclause_indexes()
 * 'rel' is the relation entry for which the paths are to be created
 *
 * Returns nothing; each path that could be built is handed to add_path().
 *
 * Note: check_partial_indexes() must have been run previously.
 */
void
create_or_index_paths(Query *root, RelOptInfo *rel)
{
	ListCell   *cell;

	foreach(cell, rel->baserestrictinfo)
	{
		RestrictInfo *restriction = (RestrictInfo *) lfirst(cell);
		BoolExpr   *orexpr;
		IndexPath  *orpath;

		/* Anything that is not an OR clause is of no interest here */
		if (!restriction_is_or_clause(restriction))
			continue;

		orexpr = (BoolExpr *) restriction->orclause;
		orpath = best_or_subclause_indexes(root, rel, orexpr->args);

		/* NULL means at least one OR arm had no usable index */
		if (orpath != NULL)
			add_path(rel, (Path *) orpath);
	}
}
/*
* best_or_subclause_indexes
* Determine the best index to be used in conjunction with each subclause
* of an OR clause, and build a Path for a multi-index scan.
*
* 'rel' is the node of the relation to be scanned
* 'subclauses' are the subclauses of the OR clause (must be the modified
* form that includes sub-RestrictInfo clauses)
*
* Returns an IndexPath if successful, or NULL if it is not possible to
* find an index for each OR subclause.
*
* NOTE: we choose each scan on the basis of its total cost, ignoring startup
* cost. This is reasonable as long as all index types have zero or small
* startup cost, but we might have to work harder if any index types with
* nontrivial startup cost are ever invented.
*
* This routine also creates the indexqual list that will be needed by
* the executor. The indexqual list has one entry for each scan of the base
* rel, which is a sublist of indexqual conditions to apply in that scan.
* The implicit semantics are AND across each sublist of quals, and OR across
* the toplevel list (note that the executor takes care not to return any
* single tuple more than once).
*/
static IndexPath *
best_or_subclause_indexes(Query *root,
1999-05-26 00:43:53 +02:00
RelOptInfo *rel,
List *subclauses)
{
List *infos = NIL;
List *clauses = NIL;
List *quals = NIL;
Cost path_startup_cost = 0;
Cost path_total_cost = 0;
ListCell *slist;
IndexPath *pathnode;
/* Gather info for each OR subclause */
foreach(slist, subclauses)
{
Expr *subclause = lfirst(slist);
IndexOptInfo *best_indexinfo;
List *best_indexclauses;
List *best_indexquals;
Cost best_startup_cost;
Cost best_total_cost;
if (!best_or_subclause_index(root, rel, subclause,
&best_indexinfo,
&best_indexclauses, &best_indexquals,
&best_startup_cost, &best_total_cost))
return NULL; /* failed to match this subclause */
infos = lappend(infos, best_indexinfo);
clauses = lappend(clauses, best_indexclauses);
quals = lappend(quals, best_indexquals);
/*
* Path startup_cost is the startup cost for the first index scan only;
* startup costs for later scans will be paid later on, so they just
* get reflected in total_cost.
*
* Total cost is sum of the per-scan costs.
*/
if (slist == list_head(subclauses)) /* first scan? */
path_startup_cost = best_startup_cost;
path_total_cost += best_total_cost;
}
/* We succeeded, so build an IndexPath node */
pathnode = makeNode(IndexPath);
pathnode->path.pathtype = T_IndexScan;
pathnode->path.parent = rel;
pathnode->path.startup_cost = path_startup_cost;
pathnode->path.total_cost = path_total_cost;
/*
* This is an IndexScan, but the overall result will consist of tuples
* extracted in multiple passes (one for each subclause of the OR),
* so the result cannot be claimed to have any particular ordering.
*/
pathnode->path.pathkeys = NIL;
pathnode->indexinfo = infos;
pathnode->indexclauses = clauses;
pathnode->indexquals = quals;
/* It's not an innerjoin path. */
pathnode->isjoininner = false;
/* We don't actually care what order the index scans in. */
pathnode->indexscandir = NoMovementScanDirection;
/*
* The number of rows is the same as the parent rel's estimate, since
* this isn't a join inner indexscan.
*/
pathnode->rows = rel->rows;
return pathnode;
}
/*
* best_or_subclause_index
* Determines which is the best index to be used with a subclause of an
* OR clause by estimating the cost of using each index and selecting
* the least expensive (considering total cost only, for now).
*
* Returns FALSE if no index exists that can be used with this OR subclause;
* in that case the output parameters are not set.
*
* 'rel' is the node of the relation to be scanned
* 'subclause' is the OR subclause being considered
*
* '*retIndexInfo' gets the IndexOptInfo of the best index
* '*retIndexClauses' gets a list of the index clauses for the best index
* '*retIndexQuals' gets a list of the expanded indexquals for the best index
* '*retStartupCost' gets the startup cost of a scan with that index
* '*retTotalCost' gets the total cost of a scan with that index
*/
static bool
best_or_subclause_index(Query *root,
1999-05-26 00:43:53 +02:00
RelOptInfo *rel,
Expr *subclause,
IndexOptInfo **retIndexInfo, /* return value */
List **retIndexClauses, /* return value */
List **retIndexQuals, /* return value */
Cost *retStartupCost, /* return value */
Cost *retTotalCost) /* return value */
{
bool found = false;
ListCell *ilist;
1998-08-31 09:19:56 +02:00
foreach(ilist, rel->indexlist)
{
IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
List *indexclauses;
List *indexquals;
Path subclause_path;
/* Ignore partial indexes that do not match the query */
if (index->indpred != NIL && !index->predOK)
continue;
/* Collect index clauses usable with this index */
indexclauses = group_clauses_by_indexkey_for_or(rel, index, subclause);
/* Ignore index if it doesn't match the subclause at all */
if (indexclauses == NIL)
continue;
/* Convert clauses to indexquals the executor can handle */
indexquals = expand_indexqual_conditions(index, indexclauses);
cost_index(&subclause_path, root, rel, index, indexquals, false);
if (!found || subclause_path.total_cost < *retTotalCost)
{
*retIndexInfo = index;
*retIndexClauses = flatten_clausegroups_list(indexclauses);
*retIndexQuals = indexquals;
*retStartupCost = subclause_path.startup_cost;
*retTotalCost = subclause_path.total_cost;
found = true;
}
1998-08-02 00:12:13 +02:00
}
return found;
}