/*-------------------------------------------------------------------------
 *
 * pathnodes.h
 *	  Definitions for planner's internal data structures, especially Paths.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/nodes/pathnodes.h
 *
 *-------------------------------------------------------------------------
 */
|
#ifndef PATHNODES_H
#define PATHNODES_H

#include "access/sdir.h"
#include "lib/stringinfo.h"
#include "nodes/params.h"
#include "nodes/parsenodes.h"
#include "storage/block.h"
|
1996-08-28 03:59:28 +02:00
|
|
|
|
2003-02-08 21:20:55 +01:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
 * Relids
 *		Set of relation identifiers (indexes into the rangetable).
 *
 * Implemented as a Bitmapset pointer; NULL is a valid (empty) set.
 */
typedef Bitmapset *Relids;
|
1996-08-28 03:59:28 +02:00
|
|
|
|
2000-02-15 21:49:31 +01:00
|
|
|
/*
 * When looking for a "cheapest path", this enum specifies whether we want
 * cheapest startup cost or cheapest total cost.
 */
typedef enum CostSelector
{
	STARTUP_COST,				/* minimize cost to return the first tuple */
	TOTAL_COST					/* minimize cost to return all tuples */
} CostSelector;
|
|
|
|
|
2003-01-12 23:35:29 +01:00
|
|
|
/*
 * The cost estimate produced by cost_qual_eval() includes both a one-time
 * (startup) cost, and a per-tuple cost.
 */
typedef struct QualCost
{
	Cost		startup;		/* one-time cost */
	Cost		per_tuple;		/* per-evaluation cost */
} QualCost;
|
|
|
|
|
2011-04-24 22:55:20 +02:00
|
|
|
/*
 * Costing aggregate function execution requires these statistics about
 * the aggregates to be executed by a given Agg node.  Note that the costs
 * include the execution costs of the aggregates' argument expressions as
 * well as the aggregate functions themselves.  Also, the fields must be
 * defined so that initializing the struct to zeroes with memset is correct.
 */
typedef struct AggClauseCosts
{
	QualCost	transCost;		/* total per-input-row execution costs */
	QualCost	finalCost;		/* total per-aggregated-row costs */
	Size		transitionSpace;	/* space for pass-by-ref transition data */
} AggClauseCosts;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
 * This enum identifies the different types of "upper" (post-scan/join)
 * relations that we might deal with during planning.
 */
typedef enum UpperRelationKind
{
	UPPERREL_SETOP,				/* result of UNION/INTERSECT/EXCEPT, if any */
	UPPERREL_PARTIAL_GROUP_AGG, /* result of partial grouping/aggregation,
								 * if any */
	UPPERREL_GROUP_AGG,			/* result of grouping/aggregation, if any */
	UPPERREL_WINDOW,			/* result of window functions, if any */
	UPPERREL_PARTIAL_DISTINCT,	/* result of partial "SELECT DISTINCT",
								 * if any */
	UPPERREL_DISTINCT,			/* result of "SELECT DISTINCT", if any */
	UPPERREL_ORDERED,			/* result of ORDER BY, if any */
	UPPERREL_FINAL				/* result of any remaining top-level actions */

	/* NB: UPPERREL_FINAL must be last enum entry; it's used to size arrays */
} UpperRelationKind;
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
/*----------
 * PlannerGlobal
 *		Global information for planning/optimization
 *
 * PlannerGlobal holds state for an entire planner invocation; this state
 * is shared across all levels of sub-Queries that exist in the command being
 * planned.
 *----------
 */
typedef struct PlannerGlobal
{
	NodeTag		type;

	/* Param values provided to planner() */
	ParamListInfo boundParams;

	/* Plans for SubPlan nodes */
	List	   *subplans;

	/* PlannerInfos for SubPlan nodes, one-for-one with subplans */
	List	   *subroots;

	/* indices of subplans that require REWIND */
	Bitmapset  *rewindPlanIDs;

	/* "flat" rangetable for executor */
	List	   *finalrtable;

	/* "flat" list of PlanRowMarks */
	List	   *finalrowmarks;

	/* "flat" list of integer RT indexes */
	List	   *resultRelations;

	/* "flat" list of AppendRelInfos */
	List	   *appendRelations;

	/* OIDs of relations the plan depends on */
	List	   *relationOids;

	/* other dependencies, as PlanInvalItems */
	List	   *invalItems;

	/* type OIDs for PARAM_EXEC Params */
	List	   *paramExecTypes;

	/* highest PlaceHolderVar ID assigned */
	Index		lastPHId;

	/* highest PlanRowMark ID assigned */
	Index		lastRowMarkId;

	/* highest plan node ID assigned */
	int			lastPlanNodeId;

	/* redo plan when TransactionXmin changes? */
	bool		transientPlan;

	/* is plan specific to current role? */
	bool		dependsOnRole;

	/* parallel mode potentially OK? */
	bool		parallelModeOK;

	/* parallel mode actually required? */
	bool		parallelModeNeeded;

	/* worst PROPARALLEL hazard level among functions used by the plan */
	char		maxParallelHazard;

	/* partition descriptors; gives a consistent view of the PartitionDescs
	 * for the duration of this planner invocation */
	PartitionDirectory partition_directory;
} PlannerGlobal;
|
|
|
|
|
2007-02-22 23:00:26 +01:00
|
|
|
/*
 * Macro for fetching the Plan associated with a SubPlan node.
 * plan_id is 1-based, hence the "- 1" when indexing the subplans list.
 */
#define planner_subplan_get_plan(root, subplan) \
	((Plan *) list_nth((root)->glob->subplans, (subplan)->plan_id - 1))
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
|
2005-06-06 00:32:58 +02:00
|
|
|
/*----------
|
|
|
|
* PlannerInfo
|
|
|
|
* Per-query information for planning/optimization
|
|
|
|
*
|
|
|
|
* This struct is conventionally called "root" in all the planner routines.
|
|
|
|
* It holds links to all of the planner's working state, in addition to the
|
2006-01-31 22:39:25 +01:00
|
|
|
* original Query. Note that at present the planner extensively modifies
|
2005-06-06 00:32:58 +02:00
|
|
|
* the passed-in Query data structure; someday that should stop.
|
2019-01-29 21:48:51 +01:00
|
|
|
*
|
|
|
|
* For reasons explained in optimizer/optimizer.h, we define the typedef
|
|
|
|
* either here or in that header, whichever is read first.
|
2005-06-06 00:32:58 +02:00
|
|
|
*----------
|
|
|
|
*/
|
2019-01-29 21:48:51 +01:00
|
|
|
#ifndef HAVE_PLANNERINFO_TYPEDEF
|
|
|
|
typedef struct PlannerInfo PlannerInfo;
|
|
|
|
#define HAVE_PLANNERINFO_TYPEDEF 1
|
|
|
|
#endif
|
2018-06-26 16:35:26 +02:00
|
|
|
|
2019-01-29 21:48:51 +01:00
|
|
|
struct PlannerInfo
|
2005-06-06 00:32:58 +02:00
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
|
|
|
|
Query *parse; /* the Query being planned */
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
PlannerGlobal *glob; /* global info for current planner run */
|
|
|
|
|
|
|
|
Index query_level; /* 1 at the outermost Query */
|
|
|
|
|
2019-01-29 21:48:51 +01:00
|
|
|
PlannerInfo *parent_root; /* NULL at outermost Query */
|
2008-10-04 23:56:55 +02:00
|
|
|
|
2015-08-12 05:48:37 +02:00
|
|
|
/*
|
|
|
|
* plan_params contains the expressions that this query level needs to
|
|
|
|
* make available to a lower query level that is currently being planned.
|
|
|
|
* outer_params contains the paramIds of PARAM_EXEC Params that outer
|
|
|
|
* query levels will make available to this query level.
|
|
|
|
*/
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evaluation mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
List *plan_params; /* list of PlannerParamItems, see below */
|
2015-08-12 05:48:37 +02:00
|
|
|
Bitmapset *outer_params;
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evaluation mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
|
2005-06-06 06:13:36 +02:00
|
|
|
/*
|
2006-01-31 22:39:25 +01:00
|
|
|
* simple_rel_array holds pointers to "base rels" and "other rels" (see
|
2005-06-06 06:13:36 +02:00
|
|
|
* comments for RelOptInfo for more info). It is indexed by rangetable
|
|
|
|
* index (so entry 0 is always wasted). Entries can be NULL when an RTE
|
2006-01-31 22:39:25 +01:00
|
|
|
* does not correspond to a base relation, such as a join RTE or an
|
|
|
|
* unreferenced view RTE; or if the RelOptInfo hasn't been made yet.
|
2005-06-06 06:13:36 +02:00
|
|
|
*/
|
2007-01-20 21:45:41 +01:00
|
|
|
struct RelOptInfo **simple_rel_array; /* All 1-rel RelOptInfos */
|
2006-01-31 22:39:25 +01:00
|
|
|
int simple_rel_array_size; /* allocated size of array */
|
2005-06-06 06:13:36 +02:00
|
|
|
|
2007-04-21 23:01:45 +02:00
|
|
|
/*
|
|
|
|
* simple_rte_array is the same length as simple_rel_array and holds
|
2019-08-09 18:33:43 +02:00
|
|
|
* pointers to the associated rangetable entries. Using this is a shade
|
|
|
|
* faster than using rt_fetch(), mostly due to fewer indirections.
|
2007-04-21 23:01:45 +02:00
|
|
|
*/
|
|
|
|
RangeTblEntry **simple_rte_array; /* rangetable as an array */
|
|
|
|
|
2018-06-26 16:35:26 +02:00
|
|
|
/*
|
2018-06-27 21:40:24 +02:00
|
|
|
* append_rel_array is the same length as the above arrays, and holds
|
2018-06-26 16:35:26 +02:00
|
|
|
* pointers to the corresponding AppendRelInfo entry indexed by
|
2019-08-09 18:33:43 +02:00
|
|
|
* child_relid, or NULL if the rel is not an appendrel child. The array
|
|
|
|
* itself is not allocated if append_rel_list is empty.
|
2018-06-26 16:35:26 +02:00
|
|
|
*/
|
|
|
|
struct AppendRelInfo **append_rel_array;
|
|
|
|
|
2012-01-28 01:26:38 +01:00
|
|
|
/*
|
|
|
|
* all_baserels is a Relids set of all base relids (but not "other"
|
|
|
|
* relids) in the query; that is, the Relids identifier of the final join
|
Compute correct em_nullable_relids in get_eclass_for_sort_expr().
Bug #8591 from Claudio Freire demonstrates that get_eclass_for_sort_expr
must be able to compute valid em_nullable_relids for any new equivalence
class members it creates. I'd worried about this in the commit message
for db9f0e1d9a4a0842c814a464cdc9758c3f20b96c, but claimed that it wasn't a
problem because multi-member ECs should already exist when it runs. That
is transparently wrong, though, because this function is also called by
initialize_mergeclause_eclasses, which runs during deconstruct_jointree.
The example given in the bug report (which the new regression test item
is based upon) fails because the COALESCE() expression is first seen by
initialize_mergeclause_eclasses rather than process_equivalence.
Fixing this requires passing the appropriate nullable_relids set to
get_eclass_for_sort_expr, and it requires new code to compute that set
for top-level expressions such as ORDER BY, GROUP BY, etc. We store
the top-level nullable_relids in a new field in PlannerInfo to avoid
computing it many times. In the back branches, I've added the new
field at the end of the struct to minimize ABI breakage for planner
plugins. There doesn't seem to be a good alternative to changing
get_eclass_for_sort_expr's API signature, though. There probably aren't
any third-party extensions calling that function directly; moreover,
if there are, they probably need to think about what to pass for
nullable_relids anyway.
Back-patch to 9.2, like the previous patch in this area.
2013-11-15 22:46:18 +01:00
|
|
|
* we need to form. This is computed in make_one_rel, just before we
|
|
|
|
* start making Paths.
|
2012-01-28 01:26:38 +01:00
|
|
|
*/
|
|
|
|
Relids all_baserels;
|
|
|
|
|
Compute correct em_nullable_relids in get_eclass_for_sort_expr().
Bug #8591 from Claudio Freire demonstrates that get_eclass_for_sort_expr
must be able to compute valid em_nullable_relids for any new equivalence
class members it creates. I'd worried about this in the commit message
for db9f0e1d9a4a0842c814a464cdc9758c3f20b96c, but claimed that it wasn't a
problem because multi-member ECs should already exist when it runs. That
is transparently wrong, though, because this function is also called by
initialize_mergeclause_eclasses, which runs during deconstruct_jointree.
The example given in the bug report (which the new regression test item
is based upon) fails because the COALESCE() expression is first seen by
initialize_mergeclause_eclasses rather than process_equivalence.
Fixing this requires passing the appropriate nullable_relids set to
get_eclass_for_sort_expr, and it requires new code to compute that set
for top-level expressions such as ORDER BY, GROUP BY, etc. We store
the top-level nullable_relids in a new field in PlannerInfo to avoid
computing it many times. In the back branches, I've added the new
field at the end of the struct to minimize ABI breakage for planner
plugins. There doesn't seem to be a good alternative to changing
get_eclass_for_sort_expr's API signature, though. There probably aren't
any third-party extensions calling that function directly; moreover,
if there are, they probably need to think about what to pass for
nullable_relids anyway.
Back-patch to 9.2, like the previous patch in this area.
2013-11-15 22:46:18 +01:00
|
|
|
/*
|
|
|
|
* nullable_baserels is a Relids set of base relids that are nullable by
|
|
|
|
* some outer join in the jointree; these are rels that are potentially
|
|
|
|
* nullable below the WHERE clause, SELECT targetlist, etc. This is
|
|
|
|
* computed in deconstruct_jointree.
|
|
|
|
*/
|
|
|
|
Relids nullable_baserels;
|
|
|
|
|
2005-06-09 01:02:05 +02:00
|
|
|
/*
|
|
|
|
* join_rel_list is a list of all join-relation RelOptInfos we have
|
|
|
|
* considered in this planning run. For small problems we just scan the
|
|
|
|
* list to do lookups, but when there are many join relations we build a
|
|
|
|
* hash table for faster lookups. The hash table is present and valid
|
|
|
|
* when join_rel_hash is not NULL. Note that we still maintain the list
|
|
|
|
* even when using the hash table for lookups; this simplifies life for
|
|
|
|
* GEQO.
|
|
|
|
*/
|
2005-06-06 00:32:58 +02:00
|
|
|
List *join_rel_list; /* list of join-relation RelOptInfos */
|
2005-06-09 01:02:05 +02:00
|
|
|
struct HTAB *join_rel_hash; /* optional hashtable for join relations */
|
2005-06-06 00:32:58 +02:00
|
|
|
|
2009-11-28 01:46:19 +01:00
|
|
|
/*
|
|
|
|
* When doing a dynamic-programming-style join search, join_rel_level[k]
|
|
|
|
* is a list of all join-relation RelOptInfos of level k, and
|
|
|
|
* join_cur_level is the current level. New join-relation RelOptInfos are
|
|
|
|
* automatically added to the join_rel_level[join_cur_level] list.
|
|
|
|
* join_rel_level is NULL if not in use.
|
|
|
|
*/
|
|
|
|
List **join_rel_level; /* lists of join-relation RelOptInfos */
|
|
|
|
int join_cur_level; /* index of list being extended */
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
List *init_plans; /* init SubPlans for query */
|
|
|
|
|
|
|
|
List *cte_plan_ids; /* per-CTE-item list of subplan IDs */
|
2007-02-19 08:03:34 +01:00
|
|
|
|
Implement UPDATE tab SET (col1,col2,...) = (SELECT ...), ...
This SQL-standard feature allows a sub-SELECT yielding multiple columns
(but only one row) to be used to compute the new values of several columns
to be updated. While the same results can be had with an independent
sub-SELECT per column, such a workaround can require a great deal of
duplicated computation.
The standard actually says that the source for a multi-column assignment
could be any row-valued expression. The implementation used here is
tightly tied to our existing sub-SELECT support and can't handle other
cases; the Bison grammar would have some issues with them too. However,
I don't feel too bad about this since other cases can be converted into
sub-SELECTs. For instance, "SET (a,b,c) = row_valued_function(x)" could
be written "SET (a,b,c) = (SELECT * FROM row_valued_function(x))".
2014-06-18 19:22:25 +02:00
|
|
|
List *multiexpr_params; /* List of Lists of Params for MULTIEXPR
|
|
|
|
* subquery outputs */
|
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
List *eq_classes; /* list of active EquivalenceClasses */
|
2005-06-06 00:32:58 +02:00
|
|
|
|
Speed up finding EquivalenceClasses for a given set of rels
Previously in order to determine which ECs a relation had members in, we
had to loop over all ECs stored in PlannerInfo's eq_classes and check if
ec_relids mentioned the relation. For the most part, this was fine, as
generally, unless queries were fairly complex, the overhead of performing
the lookup would have not been that significant. However, when queries
contained large numbers of joins and ECs, the overhead to find the set of
classes matching a given set of relations could become a significant
portion of the overall planning effort.
Here we allow a much more efficient method to access the ECs which match a
given relation or set of relations. A new Bitmapset field in RelOptInfo
now exists to store the indexes into PlannerInfo's eq_classes list which
each relation is mentioned in. This allows very fast lookups to find all
ECs belonging to a single relation. When we need to lookup ECs belonging
to a given pair of relations, we can simply bitwise-AND the Bitmapsets from
each relation and use the result to perform the lookup.
We also take the opportunity to write a new implementation of
generate_join_implied_equalities which makes use of the new indexes.
generate_join_implied_equalities_for_ecs must remain as is as it can be
given a custom list of ECs, which we can't easily determine the indexes of.
This was originally intended to fix the performance penalty of looking up
foreign keys matching a join condition which was introduced by 100340e2d.
However, we're speeding up much more than just that here.
Author: David Rowley, Tom Lane
Reviewed-by: Tom Lane, Tomas Vondra
Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
|
|
|
bool ec_merging_done; /* set true once ECs are canonical */
|
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
List *canon_pathkeys; /* list of "canonical" PathKeys */
|
Teach planner about some cases where a restriction clause can be
propagated inside an outer join. In particular, given
LEFT JOIN ON (A = B) WHERE A = constant, we cannot conclude that
B = constant at the top level (B might be null instead), but we
can nonetheless put a restriction B = constant into the quals for
B's relation, since no inner-side rows not meeting that condition
can contribute to the final result. Similarly, given
FULL JOIN USING (J) WHERE J = constant, we can't directly conclude
that either input J variable = constant, but it's OK to push such
quals into each input rel. Per recent gripe from Kim Bisgaard.
Along the way, remove 'valid_everywhere' flag from RestrictInfo,
as on closer analysis it was not being used for anything, and was
defined backwards anyway.
2005-07-03 01:00:42 +02:00
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
List *left_join_clauses; /* list of RestrictInfos for mergejoinable
|
|
|
|
* outer join clauses w/nonnullable var on
|
|
|
|
* left */
|
Teach planner about some cases where a restriction clause can be
propagated inside an outer join. In particular, given
LEFT JOIN ON (A = B) WHERE A = constant, we cannot conclude that
B = constant at the top level (B might be null instead), but we
can nonetheless put a restriction B = constant into the quals for
B's relation, since no inner-side rows not meeting that condition
can contribute to the final result. Similarly, given
FULL JOIN USING (J) WHERE J = constant, we can't directly conclude
that either input J variable = constant, but it's OK to push such
quals into each input rel. Per recent gripe from Kim Bisgaard.
Along the way, remove 'valid_everywhere' flag from RestrictInfo,
as on closer analysis it was not being used for anything, and was
defined backwards anyway.
2005-07-03 01:00:42 +02:00
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
List *right_join_clauses; /* list of RestrictInfos for mergejoinable
|
|
|
|
* outer join clauses w/nonnullable var on
|
|
|
|
* right */
|
|
|
|
|
|
|
|
List *full_join_clauses; /* list of RestrictInfos for mergejoinable
|
|
|
|
* full join clauses */
|
Teach planner about some cases where a restriction clause can be
propagated inside an outer join. In particular, given
LEFT JOIN ON (A = B) WHERE A = constant, we cannot conclude that
B = constant at the top level (B might be null instead), but we
can nonetheless put a restriction B = constant into the quals for
B's relation, since no inner-side rows not meeting that condition
can contribute to the final result. Similarly, given
FULL JOIN USING (J) WHERE J = constant, we can't directly conclude
that either input J variable = constant, but it's OK to push such
quals into each input rel. Per recent gripe from Kim Bisgaard.
Along the way, remove 'valid_everywhere' flag from RestrictInfo,
as on closer analysis it was not being used for anything, and was
defined backwards anyway.
2005-07-03 01:00:42 +02:00
|
|
|
|
2012-08-27 04:48:55 +02:00
|
|
|
List *join_info_list; /* list of SpecialJoinInfos */
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
/*
|
|
|
|
* all_result_relids is empty for SELECT, otherwise it contains at least
|
|
|
|
* parse->resultRelation. For UPDATE/DELETE across an inheritance or
|
|
|
|
* partitioning tree, the result rel's child relids are added. When using
|
|
|
|
* multi-level partitioning, intermediate partitioned rels are included.
|
|
|
|
* leaf_result_relids is similar except that only actual result tables,
|
|
|
|
* not partitioned tables, are included in it.
|
|
|
|
*/
|
|
|
|
Relids all_result_relids; /* set of all result relids */
|
|
|
|
Relids leaf_result_relids; /* set of all leaf relids */
|
|
|
|
|
Use Append rather than MergeAppend for scanning ordered partitions.
If we need ordered output from a scan of a partitioned table, but
the ordering matches the partition ordering, then we don't need to
use a MergeAppend to combine the pre-ordered per-partition scan
results: a plain Append will produce the same results. This
both saves useless comparison work inside the MergeAppend proper,
and allows us to start returning tuples after starting up just
the first child node, not all of them.
However, all is not peaches and cream, because if some of the
child nodes have high startup costs then there will be big
discontinuities in the tuples-returned-versus-elapsed-time curve.
The planner's cost model cannot handle that (yet, anyway).
If we model the Append's startup cost as being just the first
child's startup cost, we may drastically underestimate the cost
of fetching slightly more tuples than are available from the first
child. Since we've had bad experiences with over-optimistic choices
of "fast start" plans for ORDER BY LIMIT queries, that seems scary.
As a klugy workaround, set the startup cost estimate for an ordered
Append to be the sum of its children's startup costs (as MergeAppend
would). This doesn't really describe reality, but it's less likely
to cause a bad plan choice than an underestimated startup cost would.
In practice, the cases where we really care about this optimization
will have child plans that are IndexScans with zero startup cost,
so that the overly conservative estimate is still just zero.
David Rowley, reviewed by Julien Rouhaud and Antonin Houska
Discussion: https://postgr.es/m/CAKJS1f-hAqhPLRk_RaSFTgYxd=Tz5hA7kQ2h4-DhJufQk8TGuw@mail.gmail.com
2019-04-06 01:20:30 +02:00
|
|
|
/*
|
|
|
|
* Note: for AppendRelInfos describing partitions of a partitioned table,
|
|
|
|
* we guarantee that partitions that come earlier in the partitioned
|
|
|
|
* table's PartitionDesc will appear earlier in append_rel_list.
|
|
|
|
*/
|
2006-01-31 22:39:25 +01:00
|
|
|
List *append_rel_list; /* list of AppendRelInfos */
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
List *row_identity_vars; /* list of RowIdentityVarInfos */
|
|
|
|
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
List *rowMarks; /* list of PlanRowMarks */
|
|
|
|
|
2008-10-21 22:42:53 +02:00
|
|
|
List *placeholder_list; /* list of PlaceHolderInfos */
|
|
|
|
|
2016-06-18 21:22:34 +02:00
|
|
|
List *fkey_list; /* list of ForeignKeyOptInfos */
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
List *query_pathkeys; /* desired pathkeys for query_planner() */
|
2005-06-06 00:32:58 +02:00
|
|
|
|
2008-08-05 04:43:18 +02:00
|
|
|
List *group_pathkeys; /* groupClause pathkeys, if any */
|
2008-12-28 19:54:01 +01:00
|
|
|
List *window_pathkeys; /* pathkeys of bottom window, if any */
|
2008-08-05 04:43:18 +02:00
|
|
|
List *distinct_pathkeys; /* distinctClause pathkeys, if any */
|
|
|
|
List *sort_pathkeys; /* sortClause pathkeys, if any */
|
2005-08-28 00:13:44 +02:00
|
|
|
|
2017-09-21 05:33:04 +02:00
|
|
|
List *part_schemes; /* Canonicalised partition schemes used in the
|
|
|
|
* query. */
|
|
|
|
|
2008-01-11 05:02:18 +01:00
|
|
|
List *initial_rels; /* RelOptInfos we are now trying to join */
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/* Use fetch_upper_rel() to get any particular upper rel */
|
|
|
|
List *upper_rels[UPPERREL_FINAL + 1]; /* upper-rel RelOptInfos */
|
|
|
|
|
2016-03-15 00:23:29 +01:00
|
|
|
/* Result tlists chosen by grouping_planner for upper-stage processing */
|
|
|
|
struct PathTarget *upper_targets[UPPERREL_FINAL + 1];
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
|
Avoid passing query tlist around separately from root->processed_tlist.
In the dim past, the planner kept the fully-processed version of the query
targetlist (the result of preprocess_targetlist) in grouping_planner's
local variable "tlist", and only grudgingly passed it to individual other
routines as needed. Later we discovered a need to still have it available
after grouping_planner finishes, and invented the root->processed_tlist
field for that purpose, but it wasn't used internally to grouping_planner;
the tlist was still being passed around separately in the same places as
before.
Now comes a proposed patch to allow appendrel expansion to add entries
to the processed tlist, well after preprocess_targetlist has finished
its work. To avoid having to pass around the tlist explicitly, it's
proposed to allow appendrel expansion to modify root->processed_tlist.
That makes aliasing the tlist with assorted parameters and local
variables really scary. It would accidentally work as long as the
tlist is initially nonempty, because then the List header won't move
around, but it's not exactly hard to think of ways for that to break.
Aliased values are poor programming practice anyway.
Hence, get rid of local variables and parameters that can be identified
with root->processed_tlist, in favor of just using that field directly.
And adjust comments to match. (Some of the new comments speak as though
it's already possible for appendrel expansion to modify the tlist; that's
not true yet, but will happen in a later patch.)
Discussion: https://postgr.es/m/9d7c5112-cb99-6a47-d3be-cf1ee6862a1d@lab.ntt.co.jp
2019-03-27 17:57:41 +01:00
|
|
|
* The fully-processed targetlist is kept here. It differs from
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
* parse->targetList in that (for INSERT) it's been reordered to match the
|
|
|
|
* target table, and defaults have been filled in. Also, additional
|
|
|
|
* resjunk targets may be present. preprocess_targetlist() does most of
|
|
|
|
* that work, but note that more resjunk targets can get added during
|
|
|
|
* appendrel expansion. (Hence, upper_targets mustn't get set up till
|
|
|
|
* after that.)
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
*/
|
|
|
|
List *processed_tlist;
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
/*
|
|
|
|
* For UPDATE, this list contains the target table's attribute numbers to
|
|
|
|
* which the first N entries of processed_tlist are to be assigned. (Any
|
|
|
|
* additional entries in processed_tlist must be resjunk.) DO NOT use the
|
|
|
|
* resnos in processed_tlist to identify the UPDATE target columns.
|
|
|
|
*/
|
|
|
|
List *update_colnos;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/* Fields filled during create_plan() for use in setrefs.c */
|
|
|
|
AttrNumber *grouping_map; /* for GroupingFunc fixup */
|
|
|
|
List *minmax_aggs; /* List of MinMaxAggInfos */
|
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
MemoryContext planner_cxt; /* context holding PlannerInfo */
|
|
|
|
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality total_table_pages; /* # of pages in all non-dummy tables of
|
2018-11-07 18:12:56 +01:00
|
|
|
* query */
|
2006-09-20 00:49:53 +02:00
|
|
|
|
2021-09-15 18:56:13 +02:00
|
|
|
Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
|
|
|
|
Cardinality limit_tuples; /* limit_tuples passed to query_planner */
|
2005-06-10 05:32:25 +02:00
|
|
|
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
Index qual_security_level; /* minimum security_level for quals */
|
|
|
|
/* Note: qual_security_level is zero if there are no securityQuals */
|
|
|
|
|
2005-06-06 00:32:58 +02:00
|
|
|
bool hasJoinRTEs; /* true if any RTEs are RTE_JOIN kind */
|
2012-08-27 04:48:55 +02:00
|
|
|
bool hasLateralRTEs; /* true if any RTEs are marked LATERAL */
|
2005-06-06 00:32:58 +02:00
|
|
|
bool hasHavingQual; /* true if havingQual was non-null */
|
Revise the planner's handling of "pseudoconstant" WHERE clauses, that is
clauses containing no variables and no volatile functions. Such a clause
can be used as a one-time qual in a gating Result plan node, to suppress
plan execution entirely when it is false. Even when the clause is true,
putting it in a gating node wins by avoiding repeated evaluation of the
clause. In previous PG releases, query_planner() would do this for
pseudoconstant clauses appearing at the top level of the jointree, but
there was no ability to generate a gating Result deeper in the plan tree.
To fix it, get rid of the special case in query_planner(), and instead
process pseudoconstant clauses through the normal RestrictInfo qual
distribution mechanism. When a pseudoconstant clause is found attached to
a path node in create_plan(), pull it out and generate a gating Result at
that point. This requires special-casing pseudoconstants in selectivity
estimation and cost_qual_eval, but on the whole it's pretty clean.
It probably even makes the planner a bit faster than before for the normal
case of no pseudoconstants, since removing pull_constant_clauses saves one
useless traversal of the qual tree. Per gripe from Phil Frost.
2006-07-01 20:38:33 +02:00
|
|
|
bool hasPseudoConstantQuals; /* true if any RestrictInfo has
|
|
|
|
* pseudoconstant = true */
|
Move resolution of AlternativeSubPlan choices to the planner.
When commit bd3daddaf introduced AlternativeSubPlans, I had some
ambitions towards allowing the choice of subplan to change during
execution. That has not happened, or even been thought about, in the
ensuing twelve years; so it seems like a failed experiment. So let's
rip that out and resolve the choice of subplan at the end of planning
(in setrefs.c) rather than during executor startup. This has a number
of positive benefits:
* Removal of a few hundred lines of executor code, since
AlternativeSubPlans need no longer be supported there.
* Removal of executor-startup overhead (particularly, initialization
of subplans that won't be used).
* Removal of incidental costs of having a larger plan tree, such as
tree-scanning and copying costs in the plancache; not to mention
setrefs.c's own costs of processing the discarded subplans.
* EXPLAIN no longer has to print a weird (and undocumented)
representation of an AlternativeSubPlan choice; it sees only the
subplan actually used. This should mean less confusion for users.
* Since setrefs.c knows which subexpression of a plan node it's
working on at any instant, it's possible to adjust the estimated
number of executions of the subplan based on that. For example,
we should usually estimate more executions of a qual expression
than a targetlist expression. The implementation used here is
pretty simplistic, because we don't want to expend a lot of cycles
on the issue; but it's better than ignoring the point entirely,
as the executor had to.
That last point might possibly result in shifting the choice
between hashed and non-hashed EXISTS subplans in a few cases,
but in general this patch isn't meant to change planner choices.
Since we're doing the resolution so late, it's really impossible
to change any plan choices outside the AlternativeSubPlan itself.
Patch by me; thanks to David Rowley for review.
Discussion: https://postgr.es/m/1992952.1592785225@sss.pgh.pa.us
2020-09-27 18:51:28 +02:00
|
|
|
bool hasAlternativeSubPlans; /* true if we've made any of those */
|
2008-10-04 23:56:55 +02:00
|
|
|
bool hasRecursion; /* true if planning a recursive WITH item */
|
|
|
|
|
2020-11-24 09:45:00 +01:00
|
|
|
/*
|
|
|
|
* Information about aggregates. Filled by preprocess_aggrefs().
|
|
|
|
*/
|
|
|
|
List *agginfos; /* AggInfo structs */
|
|
|
|
List *aggtransinfos; /* AggTransInfo structs */
|
|
|
|
int numOrderedAggs; /* number w/ DISTINCT/ORDER BY/WITHIN GROUP */
|
|
|
|
bool hasNonPartialAggs; /* does any agg not support partial mode? */
|
|
|
|
bool hasNonSerialAggs; /* is any partial agg non-serializable? */
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
/* These fields are used only when hasRecursion is true: */
|
|
|
|
int wt_param_id; /* PARAM_EXEC ID for the work table */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
struct Path *non_recursive_path; /* a path for non-recursive term */
|
2009-07-16 22:55:44 +02:00
|
|
|
|
2010-07-12 19:01:06 +02:00
|
|
|
/* These fields are workspace for createplan.c */
|
|
|
|
Relids curOuterRels; /* outer rels above current node */
|
|
|
|
List *curOuterParams; /* not-yet-assigned NestLoopParams */
|
|
|
|
|
2021-09-14 21:11:21 +02:00
|
|
|
/* These fields are workspace for setrefs.c */
|
|
|
|
bool *isAltSubplan; /* array corresponding to glob->subplans */
|
|
|
|
bool *isUsedSubplan; /* array corresponding to glob->subplans */
|
|
|
|
|
2009-07-16 22:55:44 +02:00
|
|
|
/* optional private data for join_search_hook, e.g., GEQO */
|
|
|
|
void *join_search_private;
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
|
|
|
|
/* Does this query modify any partition key columns? */
|
|
|
|
bool partColsUpdated;
|
2019-01-29 21:48:51 +01:00
|
|
|
};
|
2005-06-06 00:32:58 +02:00
|
|
|
|
|
|
|
|
2007-04-21 23:01:45 +02:00
|
|
|
/*
|
|
|
|
* In places where it's known that simple_rte_array[] must have been prepared
|
|
|
|
* already, we just index into it to fetch RTEs. In code that might be
|
|
|
|
* executed before or after entering query_planner(), use this macro.
|
|
|
|
*/
|
|
|
|
#define planner_rt_fetch(rti, root) \
|
|
|
|
((root)->simple_rte_array ? (root)->simple_rte_array[rti] : \
|
|
|
|
rt_fetch(rti, (root)->parse->rtable))
|
|
|
|
|
2017-09-21 05:33:04 +02:00
|
|
|
/*
|
|
|
|
* If multiple relations are partitioned the same way, all such partitions
|
|
|
|
* will have a pointer to the same PartitionScheme. A list of PartitionScheme
|
|
|
|
* objects is attached to the PlannerInfo. By design, the partition scheme
|
|
|
|
* incorporates only the general properties of the partition method (LIST vs.
|
|
|
|
* RANGE, number of partitioning columns and the type information for each)
|
|
|
|
* and not the specific bounds.
|
|
|
|
*
|
|
|
|
* We store the opclass-declared input data types instead of the partition key
|
|
|
|
* datatypes since the former rather than the latter are used to compare
|
|
|
|
* partition bounds. Since partition key data types and the opclass declared
|
|
|
|
* input data types are expected to be binary compatible (per ResolveOpClass),
|
|
|
|
* both of those should have same byval and length properties.
|
|
|
|
*/
|
|
|
|
typedef struct PartitionSchemeData
|
|
|
|
{
|
|
|
|
char strategy; /* partition strategy */
|
|
|
|
int16 partnatts; /* number of partition attributes */
|
|
|
|
Oid *partopfamily; /* OIDs of operator families */
|
|
|
|
Oid *partopcintype; /* OIDs of opclass declared input data types */
|
2018-02-28 18:16:09 +01:00
|
|
|
Oid *partcollation; /* OIDs of partitioning collations */
|
2017-09-21 05:33:04 +02:00
|
|
|
|
|
|
|
/* Cached information about partition key data types. */
|
|
|
|
int16 *parttyplen;
|
|
|
|
bool *parttypbyval;
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
|
|
|
|
/* Cached information about partition comparison functions. */
|
2019-08-16 19:35:31 +02:00
|
|
|
struct FmgrInfo *partsupfunc;
|
2017-09-21 05:33:04 +02:00
|
|
|
} PartitionSchemeData;
|
|
|
|
|
|
|
|
typedef struct PartitionSchemeData *PartitionScheme;
|
2007-04-21 23:01:45 +02:00
|
|
|
|
2000-09-29 20:21:41 +02:00
|
|
|
/*----------
|
1998-07-18 06:22:52 +02:00
|
|
|
* RelOptInfo
|
1999-08-16 04:17:58 +02:00
|
|
|
* Per-relation information for planning/optimization
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
2002-03-12 01:52:10 +01:00
|
|
|
* For planning purposes, a "base rel" is either a plain relation (a table)
|
2003-01-15 20:35:48 +01:00
|
|
|
* or the output of a sub-SELECT or function that appears in the range table.
|
2002-03-12 01:52:10 +01:00
|
|
|
* In either case it is uniquely identified by an RT index. A "joinrel"
|
|
|
|
* is the joining of two or more base rels. A joinrel is identified by
|
|
|
|
* the set of RT indexes for its component baserels. We create RelOptInfo
|
2005-06-06 00:32:58 +02:00
|
|
|
* nodes for each baserel and joinrel, and store them in the PlannerInfo's
|
2006-01-31 22:39:25 +01:00
|
|
|
* simple_rel_array and join_rel_list respectively.
|
2000-09-29 20:21:41 +02:00
|
|
|
*
|
2002-03-12 01:52:10 +01:00
|
|
|
* Note that there is only one joinrel for any given set of component
|
|
|
|
* baserels, no matter what order we assemble them in; so an unordered
|
|
|
|
* set is the right datatype to identify it with.
|
2000-09-29 20:21:41 +02:00
|
|
|
*
|
2002-03-12 01:52:10 +01:00
|
|
|
* We also have "other rels", which are like base rels in that they refer to
|
2005-06-06 06:13:36 +02:00
|
|
|
* single RT indexes; but they are not part of the join tree, and are given
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
* a different RelOptKind to identify them.
|
2006-01-31 22:39:25 +01:00
|
|
|
* Currently the only kind of otherrels are those made for member relations
|
|
|
|
* of an "append relation", that is an inheritance set or UNION ALL subquery.
|
|
|
|
* An append relation has a parent RTE that is a base rel, which represents
|
|
|
|
* the entire append relation. The member RTEs are otherrels. The parent
|
|
|
|
* is present in the query join tree but the members are not. The member
|
|
|
|
* RTEs and otherrels are used to plan the scans of the individual tables or
|
2010-10-14 22:56:39 +02:00
|
|
|
* subqueries of the append set; then the parent baserel is given Append
|
|
|
|
* and/or MergeAppend paths comprising the best paths for the individual
|
|
|
|
* member rels. (See comments for AppendRelInfo for more information.)
|
2002-03-12 01:52:10 +01:00
|
|
|
*
|
2003-01-15 20:35:48 +01:00
|
|
|
* At one time we also made otherrels to represent join RTEs, for use in
|
|
|
|
* handling join alias Vars. Currently this is not needed because all join
|
|
|
|
* alias Vars are expanded to non-aliased form during preprocess_expression.
|
|
|
|
*
|
Basic partition-wise join functionality.
Instead of joining two partitioned tables in their entirety we can, if
it is an equi-join on the partition keys, join the matching partitions
individually. This involves teaching the planner about "other join"
rels, which are related to regular join rels in the same way that
other member rels are related to baserels. This can use significantly
more CPU time and memory than regular join planning, because there may
now be a set of "other" rels not only for every base relation but also
for every join relation. In most practical cases, this probably
shouldn't be a problem, because (1) it's probably unusual to join many
tables each with many partitions using the partition keys for all
joins and (2) if you do that scenario then you probably have a big
enough machine to handle the increased memory cost of planning and (3)
the resulting plan is highly likely to be better, so what you spend in
planning you'll make up on the execution side. All the same, for now,
turn this feature off by default.
Currently, we can only perform joins between two tables whose
partitioning schemes are absolutely identical. It would be nice to
cope with other scenarios, such as extra partitions on one side or the
other with no match on the other side, but that will have to wait for
a future patch.
Ashutosh Bapat, reviewed and tested by Rajkumar Raghuwanshi, Amit
Langote, Rafia Sabih, Thomas Munro, Dilip Kumar, Antonin Houska, Amit
Khandekar, and by me. A few final adjustments by me.
Discussion: http://postgr.es/m/CAFjFpRfQ8GrQvzp3jA2wnLqrHmaXna-urjm_UY9BqXj=EaDTSA@mail.gmail.com
Discussion: http://postgr.es/m/CAFjFpRcitjfrULr5jfuKWRPsGUX0LQ0k8-yG0Qw2+1LBGNpMdw@mail.gmail.com
2017-10-06 17:11:10 +02:00
|
|
|
* We also have relations representing joins between child relations of
|
|
|
|
* different partitioned tables. These relations are not added to
|
|
|
|
* join_rel_level lists as they are not joined directly by the dynamic
|
|
|
|
* programming algorithm.
|
|
|
|
*
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
* There is also a RelOptKind for "upper" relations, which are RelOptInfos
|
|
|
|
* that describe post-scan/join processing steps, such as aggregation.
|
|
|
|
* Many of the fields in these RelOptInfos are meaningless, but their Path
|
|
|
|
* fields always hold Paths showing ways to do that processing step.
|
|
|
|
*
|
|
|
|
* Lastly, there is a RelOptKind for "dead" relations, which are base rels
|
|
|
|
* that we have proven we don't need to join after all.
|
|
|
|
*
|
2002-03-12 01:52:10 +01:00
|
|
|
* Parts of this data structure are specific to various scan and join
|
|
|
|
* mechanisms. It didn't seem worth creating new node types for them.
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
2003-02-08 21:20:55 +01:00
|
|
|
* relids - Set of base-relation identifiers; it is a base relation
|
1999-08-16 04:17:58 +02:00
|
|
|
* if there is just one, a join relation if more than one
|
2000-01-09 01:26:47 +01:00
|
|
|
* rows - estimated number of tuples in the relation after restriction
|
|
|
|
* clauses have been applied (ie, output rows of a plan for it)
|
Fix planner's cost estimation for SEMI/ANTI joins with inner indexscans.
When the inner side of a nestloop SEMI or ANTI join is an indexscan that
uses all the join clauses as indexquals, it can be presumed that both
matched and unmatched outer rows will be processed very quickly: for
matched rows, we'll stop after fetching one row from the indexscan, while
for unmatched rows we'll have an indexscan that finds no matching index
entries, which should also be quick. The planner already knew about this,
but it was nonetheless charging for at least one full run of the inner
indexscan, as a consequence of concerns about the behavior of materialized
inner scans --- but those concerns don't apply in the fast case. If the
inner side has low cardinality (many matching rows) this could make an
indexscan plan look far more expensive than it actually is. To fix,
rearrange the work in initial_cost_nestloop/final_cost_nestloop so that we
don't add the inner scan cost until we've inspected the indexquals, and
then we can add either the full-run cost or just the first tuple's cost as
appropriate.
Experimentation with this fix uncovered another problem: add_path and
friends were coded to disregard cheap startup cost when considering
parameterized paths. That's usually okay (and desirable, because it thins
the path herd faster); but in this fast case for SEMI/ANTI joins, it could
result in throwing away the desired plain indexscan path in favor of a
bitmap scan path before we ever get to the join costing logic. In the
many-matching-rows cases of interest here, a bitmap scan will do a lot more
work than required, so this is a problem. To fix, add a per-relation flag
consider_param_startup that works like the existing consider_startup flag,
but applies to parameterized paths, and set it for relations that are the
inside of a SEMI or ANTI join.
To make this patch reasonably safe to back-patch, care has been taken to
avoid changing the planner's behavior except in the very narrow case of
SEMI/ANTI joins with inner indexscans. There are places in
compare_path_costs_fuzzily and add_path_precheck that are not terribly
consistent with the new approach, but changing them will affect planner
decisions at the margins in other cases, so we'll leave that for a
HEAD-only fix.
Back-patch to 9.3; before that, the consider_startup flag didn't exist,
meaning that the second aspect of the patch would be too invasive.
Per a complaint from Peter Holzer and analysis by Tomas Vondra.
2015-06-03 17:58:47 +02:00
|
|
|
* consider_startup - true if there is any value in keeping plain paths for
|
2012-09-02 00:16:24 +02:00
|
|
|
* this rel on the basis of having cheap startup cost
|
Fix planner's cost estimation for SEMI/ANTI joins with inner indexscans.
When the inner side of a nestloop SEMI or ANTI join is an indexscan that
uses all the join clauses as indexquals, it can be presumed that both
matched and unmatched outer rows will be processed very quickly: for
matched rows, we'll stop after fetching one row from the indexscan, while
for unmatched rows we'll have an indexscan that finds no matching index
entries, which should also be quick. The planner already knew about this,
but it was nonetheless charging for at least one full run of the inner
indexscan, as a consequence of concerns about the behavior of materialized
inner scans --- but those concerns don't apply in the fast case. If the
inner side has low cardinality (many matching rows) this could make an
indexscan plan look far more expensive than it actually is. To fix,
rearrange the work in initial_cost_nestloop/final_cost_nestloop so that we
don't add the inner scan cost until we've inspected the indexquals, and
then we can add either the full-run cost or just the first tuple's cost as
appropriate.
Experimentation with this fix uncovered another problem: add_path and
friends were coded to disregard cheap startup cost when considering
parameterized paths. That's usually okay (and desirable, because it thins
the path herd faster); but in this fast case for SEMI/ANTI joins, it could
result in throwing away the desired plain indexscan path in favor of a
bitmap scan path before we ever get to the join costing logic. In the
many-matching-rows cases of interest here, a bitmap scan will do a lot more
work than required, so this is a problem. To fix, add a per-relation flag
consider_param_startup that works like the existing consider_startup flag,
but applies to parameterized paths, and set it for relations that are the
inside of a SEMI or ANTI join.
To make this patch reasonably safe to back-patch, care has been taken to
avoid changing the planner's behavior except in the very narrow case of
SEMI/ANTI joins with inner indexscans. There are places in
compare_path_costs_fuzzily and add_path_precheck that are not terribly
consistent with the new approach, but changing them will affect planner
decisions at the margins in other cases, so we'll leave that for a
HEAD-only fix.
Back-patch to 9.3; before that, the consider_startup flag didn't exist,
meaning that the second aspect of the patch would be too invasive.
Per a complaint from Peter Holzer and analysis by Tomas Vondra.
2015-06-03 17:58:47 +02:00
|
|
|
* consider_param_startup - the same for parameterized paths
|
Add an explicit representation of the output targetlist to Paths.
Up to now, there's been an assumption that all Paths for a given relation
compute the same output column set (targetlist). However, there are good
reasons to remove that assumption. For example, an indexscan on an
expression index might be able to return the value of an expensive function
"for free". While we have the ability to generate such a plan today in
simple cases, we don't have a way to model that it's cheaper than a plan
that computes the function from scratch, nor a way to create such a plan
in join cases (where the function computation would normally happen at
the topmost join node). Also, we need this so that we can have Paths
representing post-scan/join steps, where the targetlist may well change
from one step to the next. Therefore, invent a "struct PathTarget"
representing the columns we expect a plan step to emit. It's convenient
to include the output tuple width and tlist evaluation cost in this struct,
and there will likely be additional fields in future.
While Path nodes that actually do have custom outputs will need their own
PathTargets, it will still be true that most Paths for a given relation
will compute the same tlist. To reduce the overhead added by this patch,
keep a "default PathTarget" in RelOptInfo, and allow Paths that compute
that column set to just point to their parent RelOptInfo's reltarget.
(In the patch as committed, actually every Path is like that, since we
do not yet have any cases of custom PathTargets.)
I took this opportunity to provide some more-honest costing of
PlaceHolderVar evaluation. Up to now, the assumption that "scan/join
reltargetlists have cost zero" was applied not only to Vars, where it's
reasonable, but also PlaceHolderVars where it isn't. Now, we add the eval
cost of a PlaceHolderVar's expression to the first plan level where it can
be computed, by including it in the PathTarget cost field and adding that
to the cost estimates for Paths. This isn't perfect yet but it's much
better than before, and there is a way forward to improve it more. This
costing change affects the join order chosen for a couple of the regression
tests, changing expected row ordering.
2016-02-19 02:01:49 +01:00
|
|
|
* reltarget - Default Path output tlist for this rel; normally contains
|
|
|
|
* Var and PlaceHolderVar nodes for the values we need to
|
|
|
|
* output from this relation.
|
|
|
|
* List is in no particular order, but all rels of an
|
|
|
|
* appendrel set must use corresponding orders.
|
|
|
|
* NOTE: in an appendrel child relation, may contain
|
|
|
|
* arbitrary expressions pulled up from a subquery!
|
1999-08-16 04:17:58 +02:00
|
|
|
* pathlist - List of Path nodes, one for each potentially useful
|
|
|
|
* method of generating the relation
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
* ppilist - ParamPathInfo nodes for parameterized Paths, if any
|
2000-02-15 21:49:31 +01:00
|
|
|
* cheapest_startup_path - the pathlist member with lowest startup cost
|
Adjust definition of cheapest_total_path to work better with LATERAL.
In the initial cut at LATERAL, I kept the rule that cheapest_total_path
was always unparameterized, which meant it had to be NULL if the relation
has no unparameterized paths. It turns out to work much more nicely if
we always have *some* path nominated as cheapest-total for each relation.
In particular, let's still say it's the cheapest unparameterized path if
there is one; if not, take the cheapest-total-cost path among those of
the minimum available parameterization. (The first rule is actually
a special case of the second.)
This allows reversion of some temporary lobotomizations I'd put in place.
In particular, the planner can now consider hash and merge joins for
joins below a parameter-supplying nestloop, even if there aren't any
unparameterized paths available. This should bring planning of
LATERAL-containing queries to the same level as queries not using that
feature.
Along the way, simplify management of parameterized paths in add_path()
and friends. In the original coding for parameterized paths in 9.2,
I tried to minimize the logic changes in add_path(), so it just treated
parameterization as yet another dimension of comparison for paths.
We later made it ignore pathkeys (sort ordering) of parameterized paths,
on the grounds that ordering isn't a useful property for the path on the
inside of a nestloop, so we might as well get rid of useless parameterized
paths as quickly as possible. But we didn't take that reasoning as far as
we should have. Startup cost isn't a useful property inside a nestloop
either, so add_path() ought to discount startup cost of parameterized paths
as well. Having done that, the secondary sorting I'd implemented (in
add_parameterized_path) is no longer needed --- any parameterized path that
survives add_path() at all is worth considering at higher levels. So this
should be a bit faster as well as simpler.
2012-08-30 04:05:27 +02:00
|
|
|
* (regardless of ordering) among the unparameterized paths;
|
|
|
|
* or NULL if there is no unparameterized path
|
2000-02-15 21:49:31 +01:00
|
|
|
* cheapest_total_path - the pathlist member with lowest total cost
|
Adjust definition of cheapest_total_path to work better with LATERAL.
In the initial cut at LATERAL, I kept the rule that cheapest_total_path
was always unparameterized, which meant it had to be NULL if the relation
has no unparameterized paths. It turns out to work much more nicely if
we always have *some* path nominated as cheapest-total for each relation.
In particular, let's still say it's the cheapest unparameterized path if
there is one; if not, take the cheapest-total-cost path among those of
the minimum available parameterization. (The first rule is actually
a special case of the second.)
This allows reversion of some temporary lobotomizations I'd put in place.
In particular, the planner can now consider hash and merge joins for
joins below a parameter-supplying nestloop, even if there aren't any
unparameterized paths available. This should bring planning of
LATERAL-containing queries to the same level as queries not using that
feature.
Along the way, simplify management of parameterized paths in add_path()
and friends. In the original coding for parameterized paths in 9.2,
I tried to minimize the logic changes in add_path(), so it just treated
parameterization as yet another dimension of comparison for paths.
We later made it ignore pathkeys (sort ordering) of parameterized paths,
on the grounds that ordering isn't a useful property for the path on the
inside of a nestloop, so we might as well get rid of useless parameterized
paths as quickly as possible. But we didn't take that reasoning as far as
we should have. Startup cost isn't a useful property inside a nestloop
either, so add_path() ought to discount startup cost of parameterized paths
as well. Having done that, the secondary sorting I'd implemented (in
add_parameterized_path) is no longer needed --- any parameterized path that
survives add_path() at all is worth considering at higher levels. So this
should be a bit faster as well as simpler.
2012-08-30 04:05:27 +02:00
|
|
|
* (regardless of ordering) among the unparameterized paths;
|
|
|
|
* or if there is no unparameterized path, the path with lowest
|
|
|
|
* total cost among the paths with minimum parameterization
|
2003-01-20 19:55:07 +01:00
|
|
|
* cheapest_unique_path - for caching cheapest path to produce unique
|
Adjust definition of cheapest_total_path to work better with LATERAL.
In the initial cut at LATERAL, I kept the rule that cheapest_total_path
was always unparameterized, which meant it had to be NULL if the relation
has no unparameterized paths. It turns out to work much more nicely if
we always have *some* path nominated as cheapest-total for each relation.
In particular, let's still say it's the cheapest unparameterized path if
there is one; if not, take the cheapest-total-cost path among those of
the minimum available parameterization. (The first rule is actually
a special case of the second.)
This allows reversion of some temporary lobotomizations I'd put in place.
In particular, the planner can now consider hash and merge joins for
joins below a parameter-supplying nestloop, even if there aren't any
unparameterized paths available. This should bring planning of
LATERAL-containing queries to the same level as queries not using that
feature.
Along the way, simplify management of parameterized paths in add_path()
and friends. In the original coding for parameterized paths in 9.2,
I tried to minimize the logic changes in add_path(), so it just treated
parameterization as yet another dimension of comparison for paths.
We later made it ignore pathkeys (sort ordering) of parameterized paths,
on the grounds that ordering isn't a useful property for the path on the
inside of a nestloop, so we might as well get rid of useless parameterized
paths as quickly as possible. But we didn't take that reasoning as far as
we should have. Startup cost isn't a useful property inside a nestloop
either, so add_path() ought to discount startup cost of parameterized paths
as well. Having done that, the secondary sorting I'd implemented (in
add_parameterized_path) is no longer needed --- any parameterized path that
survives add_path() at all is worth considering at higher levels. So this
should be a bit faster as well as simpler.
2012-08-30 04:05:27 +02:00
|
|
|
* (no duplicates) output from relation; NULL if not yet requested
|
|
|
|
* cheapest_parameterized_paths - best paths for their parameterizations;
|
|
|
|
* always includes cheapest_total_path, even if that's unparameterized
|
2015-12-11 21:52:16 +01:00
|
|
|
* direct_lateral_relids - rels this rel has direct LATERAL references to
|
2015-12-08 00:56:14 +01:00
|
|
|
* lateral_relids - required outer rels for LATERAL, as a Relids set
|
Still more fixes for planner's handling of LATERAL references.
More fuzz testing by Andreas Seltenreich exposed that the planner did not
cope well with chains of lateral references. If relation X references Y
laterally, and Y references Z laterally, then we will have to scan X on the
inside of a nestloop with Z, so for all intents and purposes X is laterally
dependent on Z too. The planner did not understand this and would generate
intermediate joins that could not be used. While that was usually harmless
except for wasting some planning cycles, under the right circumstances it
would lead to "failed to build any N-way joins" or "could not devise a
query plan" planner failures.
To fix that, convert the existing per-relation lateral_relids and
lateral_referencers relid sets into their transitive closures; that is,
they now show all relations on which a rel is directly or indirectly
laterally dependent. This not only fixes the chained-reference problem
but allows some of the relevant tests to be made substantially simpler
and faster, since they can be reduced to simple bitmap manipulations
instead of searches of the LateralJoinInfo list.
Also, when a PlaceHolderVar that is due to be evaluated at a join contains
lateral references, we should treat those references as indirect lateral
dependencies of each of the join's base relations. This prevents us from
trying to join any individual base relations to the lateral reference
source before the join is formed, which again cannot work.
Andreas' testing also exposed another oversight in the "dangerous
PlaceHolderVar" test added in commit 85e5e222b1dd02f1. Simply rejecting
unsafe join paths in joinpath.c is insufficient, because in some cases
we will end up rejecting *all* possible paths for a particular join, again
leading to "could not devise a query plan" failures. The restriction has
to be known also to join_is_legal and its cohort functions, so that they
will not select a join for which that will happen. I chose to move the
supporting logic into joinrels.c where the latter functions are.
Back-patch to 9.3 where LATERAL support was introduced.
2015-12-11 20:22:20 +01:00
|
|
|
* (includes both direct and indirect lateral references)
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
2002-03-12 01:52:10 +01:00
|
|
|
* If the relation is a base relation it will have these fields set:
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
2003-02-08 21:20:55 +01:00
|
|
|
* relid - RTE index (this is redundant with the relids field, but
|
|
|
|
* is provided for convenience of access)
|
Make the planner assume that the entries in a VALUES list are distinct.
Previously, if we had to estimate the number of distinct values in a
VALUES column, we fell back on the default behavior used whenever we lack
statistics, which effectively is that there are Min(# of entries, 200)
distinct values. This can be very badly off with a large VALUES list,
as noted by Jeff Janes.
We could consider actually running an ANALYZE-like scan on the VALUES,
but that seems unduly expensive, and anyway it could not deliver reliable
info if the entries are not all constants. What seems like a better choice
is to assume that the values are all distinct. This will sometimes be just
as wrong as the old code, but it seems more likely to be more nearly right
in many common cases. Also, it is more consistent with what happens in
some related cases, for example WHERE x = ANY(ARRAY[1,2,3,...,n]) and
WHERE x = ANY(VALUES (1),(2),(3),...,(n)) now are estimated similarly.
This was discussed some time ago, but consensus was it'd be better
to slip it in at the start of a development cycle not near the end.
(It should've gone into v10, really, but I forgot about it.)
Discussion: https://postgr.es/m/CAMkU=1xHkyPa8VQgGcCNg3RMFFvVxUdOpus1gKcFuvVi0w6Acg@mail.gmail.com
2017-08-16 21:37:14 +02:00
|
|
|
* rtekind - copy of RTE's rtekind field
|
2003-06-30 01:05:05 +02:00
|
|
|
* min_attr, max_attr - range of valid AttrNumbers for rel
|
|
|
|
* attr_needed - array of bitmapsets indicating the highest joinrel
|
|
|
|
* in which each attribute is needed; if bit 0 is set then
|
|
|
|
* the attribute is needed as part of final targetlist
|
|
|
|
* attr_widths - cache space for per-attribute width estimates;
|
|
|
|
* zero means not computed yet
|
2012-08-27 04:48:55 +02:00
|
|
|
* lateral_vars - lateral cross-references of rel, if any (list of
|
|
|
|
* Vars and PlaceHolderVars)
|
2013-08-18 02:22:37 +02:00
|
|
|
* lateral_referencers - relids of rels that reference this one laterally
|
Still more fixes for planner's handling of LATERAL references.
More fuzz testing by Andreas Seltenreich exposed that the planner did not
cope well with chains of lateral references. If relation X references Y
laterally, and Y references Z laterally, then we will have to scan X on the
inside of a nestloop with Z, so for all intents and purposes X is laterally
dependent on Z too. The planner did not understand this and would generate
intermediate joins that could not be used. While that was usually harmless
except for wasting some planning cycles, under the right circumstances it
would lead to "failed to build any N-way joins" or "could not devise a
query plan" planner failures.
To fix that, convert the existing per-relation lateral_relids and
lateral_referencers relid sets into their transitive closures; that is,
they now show all relations on which a rel is directly or indirectly
laterally dependent. This not only fixes the chained-reference problem
but allows some of the relevant tests to be made substantially simpler
and faster, since they can be reduced to simple bitmap manipulations
instead of searches of the LateralJoinInfo list.
Also, when a PlaceHolderVar that is due to be evaluated at a join contains
lateral references, we should treat those references as indirect lateral
dependencies of each of the join's base relations. This prevents us from
trying to join any individual base relations to the lateral reference
source before the join is formed, which again cannot work.
Andreas' testing also exposed another oversight in the "dangerous
PlaceHolderVar" test added in commit 85e5e222b1dd02f1. Simply rejecting
unsafe join paths in joinpath.c is insufficient, because in some cases
we will end up rejecting *all* possible paths for a particular join, again
leading to "could not devise a query plan" failures. The restriction has
to be known also to join_is_legal and its cohort functions, so that they
will not select a join for which that will happen. I chose to move the
supporting logic into joinrels.c where the latter functions are.
Back-patch to 9.3 where LATERAL support was introduced.
2015-12-11 20:22:20 +01:00
|
|
|
* (includes both direct and indirect lateral references)
|
2001-05-20 22:28:20 +02:00
|
|
|
* indexlist - list of IndexOptInfo nodes for relation's indexes
|
2002-05-12 22:10:05 +02:00
|
|
|
* (always NIL if it's not a table)
|
|
|
|
* pages - number of disk pages in relation (zero if not a table)
|
2000-01-09 01:26:47 +01:00
|
|
|
* tuples - number of tuples in relation (not considering restrictions)
|
2011-10-14 23:23:01 +02:00
|
|
|
* allvisfrac - fraction of disk pages that are marked all-visible
|
Speed up finding EquivalenceClasses for a given set of rels
Previously in order to determine which ECs a relation had members in, we
had to loop over all ECs stored in PlannerInfo's eq_classes and check if
ec_relids mentioned the relation. For the most part, this was fine, as
generally, unless queries were fairly complex, the overhead of performing
the lookup would have not been that significant. However, when queries
contained large numbers of joins and ECs, the overhead to find the set of
classes matching a given set of relations could become a significant
portion of the overall planning effort.
Here we allow a much more efficient method to access the ECs which match a
given relation or set of relations. A new Bitmapset field in RelOptInfo
now exists to store the indexes into PlannerInfo's eq_classes list which
each relation is mentioned in. This allows very fast lookups to find all
ECs belonging to a single relation. When we need to lookup ECs belonging
to a given pair of relations, we can simply bitwise-AND the Bitmapsets from
each relation and use the result to perform the lookup.
We also take the opportunity to write a new implementation of
generate_join_implied_equalities which makes use of the new indexes.
generate_join_implied_equalities_for_ecs must remain as is as it can be
given a custom list of ECs, which we can't easily determine the indexes of.
This was originally intended to fix the performance penalty of looking up
foreign keys matching a join condition which was introduced by 100340e2d.
However, we're speeding up much more than just that here.
Author: David Rowley, Tom Lane
Reviewed-by: Tom Lane, Tomas Vondra
Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
|
|
|
* eclass_indexes - EquivalenceClasses that mention this rel (filled
|
|
|
|
* only after EC merging is complete)
|
2011-09-03 21:35:12 +02:00
|
|
|
* subroot - PlannerInfo for subquery (NULL if it's not a subquery)
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evalution mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
* subplan_params - list of PlannerParamItems to be passed to subquery
|
2000-09-29 20:21:41 +02:00
|
|
|
*
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
* Note: for a subquery, tuples and subroot are not set immediately
|
2000-09-29 20:21:41 +02:00
|
|
|
* upon creation of the RelOptInfo object; they are filled in when
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* set_subquery_pathlist processes the object.
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
2006-01-31 22:39:25 +01:00
|
|
|
* For otherrels that are appendrel members, these fields are filled
|
2015-12-08 00:56:14 +01:00
|
|
|
* in just as for a baserel, except we don't bother with lateral_vars.
|
2002-03-12 01:52:10 +01:00
|
|
|
*
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* If the relation is either a foreign table or a join of foreign tables that
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
* all belong to the same foreign server and are assigned to the same user to
|
|
|
|
* check access permissions as (cf checkAsUser), these fields will be set:
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
*
|
|
|
|
* serverid - OID of foreign server, if foreign table (else InvalidOid)
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
* userid - OID of user to check access as (InvalidOid means current user)
|
|
|
|
* useridiscurrent - we've assumed that userid equals current user
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* fdwroutine - function hooks for FDW, if foreign table (else NULL)
|
|
|
|
* fdw_private - private state for FDW, if foreign table (else NULL)
|
|
|
|
*
|
2017-04-08 04:20:03 +02:00
|
|
|
* Two fields are used to cache knowledge acquired during the join search
|
|
|
|
* about whether this rel is provably unique when being joined to given other
|
|
|
|
* relation(s), ie, it can have at most one row matching any given row from
|
|
|
|
* that join relation. Currently we only attempt such proofs, and thus only
|
|
|
|
* populate these fields, for base rels; but someday they might be used for
|
|
|
|
* join rels too:
|
|
|
|
*
|
|
|
|
* unique_for_rels - list of Relid sets, each one being a set of other
|
|
|
|
* rels for which this one has been proven unique
|
|
|
|
* non_unique_for_rels - list of Relid sets, each one being a set of
|
|
|
|
* other rels for which we have tried and failed to prove
|
|
|
|
* this one unique
|
|
|
|
*
|
2017-09-21 05:33:04 +02:00
|
|
|
* The presence of the following fields depends on the restrictions
|
2002-03-12 01:52:10 +01:00
|
|
|
* and joins that the relation participates in:
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
2000-02-07 05:41:04 +01:00
|
|
|
* baserestrictinfo - List of RestrictInfo nodes, containing info about
|
2005-06-09 06:19:00 +02:00
|
|
|
* each non-join qualification clause in which this relation
|
2000-02-07 05:41:04 +01:00
|
|
|
* participates (only used for base rels)
|
2000-02-15 21:49:31 +01:00
|
|
|
* baserestrictcost - Estimated cost of evaluating the baserestrictinfo
|
|
|
|
* clauses at a single tuple (only used for base rels)
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
* baserestrict_min_security - Smallest security_level found among
|
|
|
|
* clauses in baserestrictinfo
|
2005-06-09 06:19:00 +02:00
|
|
|
* joininfo - List of RestrictInfo nodes, containing info about each
|
2007-01-20 21:45:41 +01:00
|
|
|
* join clause in which this relation participates (but
|
|
|
|
* note this excludes clauses that might be derivable from
|
|
|
|
* EquivalenceClasses)
|
|
|
|
* has_eclass_joins - flag that EquivalenceClass joins are possible
|
2000-02-07 05:41:04 +01:00
|
|
|
*
|
|
|
|
* Note: Keeping a restrictinfo list in the RelOptInfo is useful only for
|
|
|
|
* base rels, because for a join rel the set of clauses that are treated as
|
|
|
|
* restrict clauses varies depending on which sub-relations we choose to join.
|
|
|
|
* (For example, in a 3-base-rel join, a clause relating rels 1 and 2 must be
|
|
|
|
* treated as a restrictclause if we join {1} and {2 3} to make {1 2 3}; but
|
|
|
|
* if we join {1 2} and {3} then that clause will be a restrictclause in {1 2}
|
|
|
|
* and should not be processed again at the level of {1 2 3}.) Therefore,
|
|
|
|
* the restrictinfo list in the join case appears in individual JoinPaths
|
|
|
|
* (field joinrestrictinfo), not in the parent relation. But it's OK for
|
2005-06-09 06:19:00 +02:00
|
|
|
* the RelOptInfo to store the joininfo list, because that is the same
|
2000-02-07 05:41:04 +01:00
|
|
|
* for a given rel no matter how we form it.
|
2000-02-15 21:49:31 +01:00
|
|
|
*
|
|
|
|
* We store baserestrictcost in the RelOptInfo (for base relations) because
|
|
|
|
* we know we will need it at least once (to price the sequential scan)
|
|
|
|
* and may need it multiple times to price index scans.
|
2017-09-21 05:33:04 +02:00
|
|
|
*
|
2020-04-03 23:00:25 +02:00
|
|
|
* A join relation is considered to be partitioned if it is formed from a
|
|
|
|
* join of two relations that are partitioned, have matching partitioning
|
|
|
|
* schemes, and are joined on an equijoin of the partitioning columns.
|
|
|
|
* Under those conditions we can consider the join relation to be partitioned
|
|
|
|
* by either relation's partitioning keys, though some care is needed if
|
|
|
|
* either relation can be forced to null by outer-joining. For example, an
|
|
|
|
* outer join like (A LEFT JOIN B ON A.a = B.b) may produce rows with B.b
|
|
|
|
* NULL. These rows may not fit the partitioning conditions imposed on B.
|
|
|
|
* Hence, strictly speaking, the join is not partitioned by B.b and thus
|
|
|
|
* partition keys of an outer join should include partition key expressions
|
|
|
|
* from the non-nullable side only. However, if a subsequent join uses
|
|
|
|
* strict comparison operators (and all commonly-used equijoin operators are
|
|
|
|
* strict), the presence of nulls doesn't cause a problem: such rows couldn't
|
|
|
|
* match anything on the other side and thus they don't create a need to do
|
|
|
|
* any cross-partition sub-joins. Hence we can treat such values as still
|
|
|
|
* partitioning the join output for the purpose of additional partitionwise
|
|
|
|
* joining, so long as a strict join operator is used by the next join.
|
|
|
|
*
|
2017-09-21 05:33:04 +02:00
|
|
|
* If the relation is partitioned, these fields will be set:
|
|
|
|
*
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
* part_scheme - Partitioning scheme of the relation
|
|
|
|
* nparts - Number of partitions
|
|
|
|
* boundinfo - Partition bounds
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
|
|
|
* partbounds_merged - true if partition bounds are merged ones
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
* partition_qual - Partition constraint if not the root
|
|
|
|
* part_rels - RelOptInfos for each partition
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
|
|
|
* all_partrels - Relids set of all partition relids
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
* partexprs, nullable_partexprs - Partition key expressions
|
2017-09-21 05:33:04 +02:00
|
|
|
*
|
2020-04-03 23:00:25 +02:00
|
|
|
* The partexprs and nullable_partexprs arrays each contain
|
|
|
|
* part_scheme->partnatts elements. Each of the elements is a list of
|
|
|
|
* partition key expressions. For partitioned base relations, there is one
|
|
|
|
* expression in each partexprs element, and nullable_partexprs is empty.
|
|
|
|
* For partitioned join relations, each base relation within the join
|
|
|
|
* contributes one partition key expression per partitioning column;
|
|
|
|
* that expression goes in the partexprs[i] list if the base relation
|
|
|
|
* is not nullable by this join or any lower outer join, or in the
|
|
|
|
* nullable_partexprs[i] list if the base relation is nullable.
|
2020-04-08 04:12:14 +02:00
|
|
|
* Furthermore, FULL JOINs add extra nullable_partexprs expressions
|
|
|
|
* corresponding to COALESCE expressions of the left and right join columns,
|
|
|
|
* to simplify matching join clauses to those lists.
|
2000-09-29 20:21:41 +02:00
|
|
|
*----------
|
1996-08-28 03:59:28 +02:00
|
|
|
*/
|
2021-02-27 10:59:36 +01:00
|
|
|
|
|
|
|
/*
 * Flag bits reported by table access methods; currently only one flag
 * is defined.
 */
#define AMFLAG_HAS_TID_RANGE (1 << 0)	/* AM supports TID range scans */
|
|
|
|
|
2002-03-12 01:52:10 +01:00
|
|
|
/*
 * RelOptKind classifies every RelOptInfo the planner builds.  The "other"
 * variants denote child relations: an other-member rel is the child of a
 * baserel, and other-join/other-upper rels relate to join/upper rels in the
 * same way (used by partitionwise join and partitionwise aggregation).
 */
typedef enum RelOptKind
{
	RELOPT_BASEREL,				/* simple relation from the rangetable */
	RELOPT_JOINREL,				/* join of two or more relations */
	RELOPT_OTHER_MEMBER_REL,	/* child of a baserel */
	RELOPT_OTHER_JOINREL,		/* child of a joinrel (partitionwise join) */
	RELOPT_UPPER_REL,			/* upper-level (post scan/join) relation */
	RELOPT_OTHER_UPPER_REL,		/* child of an upper rel */
	RELOPT_DEADREL				/* rel no longer of interest to the planner */
} RelOptKind;
|
1996-08-28 03:59:28 +02:00
|
|
|
|
Abstract logic to allow for multiple kinds of child rels.
Currently, the only type of child relation is an "other member rel",
which is the child of a baserel, but in the future joins and even
upper relations may have child rels. To facilitate that, introduce
macros that test for particular RelOptKind values, and use
them in various places where they help to clarify the sense of a test.
(For example, a test may allow RELOPT_OTHER_MEMBER_REL either because
it intends to allow child rels, or because it intends to allow simple
rels.)
Also, remove find_childrel_top_parent, which will not work for a
child rel that is not a baserel. Instead, add a new RelOptInfo
member top_parent_relids to track the same kind of information in a
more generic manner.
Ashutosh Bapat, slightly tweaked by me. Review and testing of the
patch set from which this was taken by Rajkumar Raghuwanshi and Rafia
Sabih.
Discussion: http://postgr.es/m/CA+TgmoagTnF2yqR3PT2rv=om=wJiZ4-A+ATwdnriTGku1CLYxA@mail.gmail.com
2017-04-04 04:41:31 +02:00
|
|
|
/*
 * Does this RelOptInfo represent a simple relation, i.e. a base relation
 * or an "other" member (child) relation?
 */
#define IS_SIMPLE_REL(rel) \
	((rel)->reloptkind == RELOPT_BASEREL || \
	 (rel)->reloptkind == RELOPT_OTHER_MEMBER_REL)
|
|
|
|
|
|
|
|
/* Does this RelOptInfo represent a join relation (regular or "other")? */
#define IS_JOIN_REL(rel)	\
	((rel)->reloptkind == RELOPT_JOINREL || \
	 (rel)->reloptkind == RELOPT_OTHER_JOINREL)
|
Abstract logic to allow for multiple kinds of child rels.
Currently, the only type of child relation is an "other member rel",
which is the child of a baserel, but in the future joins and even
upper relations may have child rels. To facilitate that, introduce
macros that test for particular RelOptKind values, and use
them in various places where they help to clarify the sense of a test.
(For example, a test may allow RELOPT_OTHER_MEMBER_REL either because
it intends to allow child rels, or because it intends to allow simple
rels.)
Also, remove find_childrel_top_parent, which will not work for a
child rel that is not a baserel. Instead, add a new RelOptInfo
member top_parent_relids to track the same kind of information in a
more generic manner.
Ashutosh Bapat, slightly tweaked by me. Review and testing of the
patch set from which this was taken by Rajkumar Raghuwanshi and Rafia
Sabih.
Discussion: http://postgr.es/m/CA+TgmoagTnF2yqR3PT2rv=om=wJiZ4-A+ATwdnriTGku1CLYxA@mail.gmail.com
2017-04-04 04:41:31 +02:00
|
|
|
|
|
|
|
/* Does this RelOptInfo represent an upper relation (regular or "other")? */
#define IS_UPPER_REL(rel)	\
	((rel)->reloptkind == RELOPT_UPPER_REL || \
	 (rel)->reloptkind == RELOPT_OTHER_UPPER_REL)
|
Abstract logic to allow for multiple kinds of child rels.
Currently, the only type of child relation is an "other member rel",
which is the child of a baserel, but in the future joins and even
upper relations may have child rels. To facilitate that, introduce
macros that test for particular RelOptKind values, and use
them in various places where they help to clarify the sense of a test.
(For example, a test may allow RELOPT_OTHER_MEMBER_REL either because
it intends to allow child rels, or because it intends to allow simple
rels.)
Also, remove find_childrel_top_parent, which will not work for a
child rel that is not a baserel. Instead, add a new RelOptInfo
member top_parent_relids to track the same kind of information in a
more generic manner.
Ashutosh Bapat, slightly tweaked by me. Review and testing of the
patch set from which this was taken by Rajkumar Raghuwanshi and Rafia
Sabih.
Discussion: http://postgr.es/m/CA+TgmoagTnF2yqR3PT2rv=om=wJiZ4-A+ATwdnriTGku1CLYxA@mail.gmail.com
2017-04-04 04:41:31 +02:00
|
|
|
|
|
|
|
/* Does this RelOptInfo represent an "other" (child) relation of any kind? */
#define IS_OTHER_REL(rel) \
	((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL || \
	 (rel)->reloptkind == RELOPT_OTHER_JOINREL || \
	 (rel)->reloptkind == RELOPT_OTHER_UPPER_REL)
|
Abstract logic to allow for multiple kinds of child rels.
Currently, the only type of child relation is an "other member rel",
which is the child of a baserel, but in the future joins and even
upper relations may have child rels. To facilitate that, introduce
macros that test for particular RelOptKind values, and use
them in various places where they help to clarify the sense of a test.
(For example, a test may allow RELOPT_OTHER_MEMBER_REL either because
it intends to allow child rels, or because it intends to allow simple
rels.)
Also, remove find_childrel_top_parent, which will not work for a
child rel that is not a baserel. Instead, add a new RelOptInfo
member top_parent_relids to track the same kind of information in a
more generic manner.
Ashutosh Bapat, slightly tweaked by me. Review and testing of the
patch set from which this was taken by Rajkumar Raghuwanshi and Rafia
Sabih.
Discussion: http://postgr.es/m/CA+TgmoagTnF2yqR3PT2rv=om=wJiZ4-A+ATwdnriTGku1CLYxA@mail.gmail.com
2017-04-04 04:41:31 +02:00
|
|
|
|
1998-07-18 06:22:52 +02:00
|
|
|
typedef struct RelOptInfo
|
1996-08-28 03:59:28 +02:00
|
|
|
{
|
|
|
|
NodeTag type;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-03-12 01:52:10 +01:00
|
|
|
RelOptKind reloptkind;
|
|
|
|
|
1999-08-16 04:17:58 +02:00
|
|
|
/* all relations included in this RelOptInfo */
|
2003-02-08 21:20:55 +01:00
|
|
|
Relids relids; /* set of base relids (rangetable indexes) */
|
1999-08-16 04:17:58 +02:00
|
|
|
|
2000-01-09 01:26:47 +01:00
|
|
|
/* size estimates generated by planner */
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality rows; /* estimated number of result tuples */
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2012-09-02 00:16:24 +02:00
|
|
|
/* per-relation planner control flags */
|
|
|
|
bool consider_startup; /* keep cheap-startup-cost paths? */
|
Fix planner's cost estimation for SEMI/ANTI joins with inner indexscans.
When the inner side of a nestloop SEMI or ANTI join is an indexscan that
uses all the join clauses as indexquals, it can be presumed that both
matched and unmatched outer rows will be processed very quickly: for
matched rows, we'll stop after fetching one row from the indexscan, while
for unmatched rows we'll have an indexscan that finds no matching index
entries, which should also be quick. The planner already knew about this,
but it was nonetheless charging for at least one full run of the inner
indexscan, as a consequence of concerns about the behavior of materialized
inner scans --- but those concerns don't apply in the fast case. If the
inner side has low cardinality (many matching rows) this could make an
indexscan plan look far more expensive than it actually is. To fix,
rearrange the work in initial_cost_nestloop/final_cost_nestloop so that we
don't add the inner scan cost until we've inspected the indexquals, and
then we can add either the full-run cost or just the first tuple's cost as
appropriate.
Experimentation with this fix uncovered another problem: add_path and
friends were coded to disregard cheap startup cost when considering
parameterized paths. That's usually okay (and desirable, because it thins
the path herd faster); but in this fast case for SEMI/ANTI joins, it could
result in throwing away the desired plain indexscan path in favor of a
bitmap scan path before we ever get to the join costing logic. In the
many-matching-rows cases of interest here, a bitmap scan will do a lot more
work than required, so this is a problem. To fix, add a per-relation flag
consider_param_startup that works like the existing consider_startup flag,
but applies to parameterized paths, and set it for relations that are the
inside of a SEMI or ANTI join.
To make this patch reasonably safe to back-patch, care has been taken to
avoid changing the planner's behavior except in the very narrow case of
SEMI/ANTI joins with inner indexscans. There are places in
compare_path_costs_fuzzily and add_path_precheck that are not terribly
consistent with the new approach, but changing them will affect planner
decisions at the margins in other cases, so we'll leave that for a
HEAD-only fix.
Back-patch to 9.3; before that, the consider_startup flag didn't exist,
meaning that the second aspect of the patch would be too invasive.
Per a complaint from Peter Holzer and analysis by Tomas Vondra.
2015-06-03 17:58:47 +02:00
|
|
|
bool consider_param_startup; /* ditto, for parameterized paths? */
|
Generate parallel sequential scan plans in simple cases.
Add a new flag, consider_parallel, to each RelOptInfo, indicating
whether a plan for that relation could conceivably be run inside of
a parallel worker. Right now, we're pretty conservative: for example,
it might be possible to defer applying a parallel-restricted qual
in a worker, and later do it in the leader, but right now we just
don't try to parallelize access to that relation. That's probably
the right decision in most cases, anyway.
Using the new flag, generate parallel sequential scan plans for plain
baserels, meaning that we now have parallel sequential scan in
PostgreSQL. The logic here is pretty unsophisticated right now: the
costing model probably isn't right in detail, and we can't push joins
beneath Gather nodes, so the number of plans that can actually benefit
from this is pretty limited right now. Lots more work is needed.
Nevertheless, it seems time to enable this functionality so that all
this code can actually be tested easily by users and developers.
Note that, if you wish to test this functionality, it will be
necessary to set max_parallel_degree to a value greater than the
default of 0. Once a few more loose ends have been tidied up here, we
might want to consider changing the default value of this GUC, but
I'm leaving it alone for now.
Along the way, fix a bug in cost_gather: the previous coding thought
that a Gather node's transfer overhead should be costed on the basis of
the relation size rather than the number of tuples that actually need
to be passed off to the leader.
Patch by me, reviewed in earlier versions by Amit Kapila.
2015-11-11 15:02:52 +01:00
|
|
|
bool consider_parallel; /* consider parallel paths? */
|
2012-09-02 00:16:24 +02:00
|
|
|
|
Add an explicit representation of the output targetlist to Paths.
Up to now, there's been an assumption that all Paths for a given relation
compute the same output column set (targetlist). However, there are good
reasons to remove that assumption. For example, an indexscan on an
expression index might be able to return the value of an expensive function
"for free". While we have the ability to generate such a plan today in
simple cases, we don't have a way to model that it's cheaper than a plan
that computes the function from scratch, nor a way to create such a plan
in join cases (where the function computation would normally happen at
the topmost join node). Also, we need this so that we can have Paths
representing post-scan/join steps, where the targetlist may well change
from one step to the next. Therefore, invent a "struct PathTarget"
representing the columns we expect a plan step to emit. It's convenient
to include the output tuple width and tlist evaluation cost in this struct,
and there will likely be additional fields in future.
While Path nodes that actually do have custom outputs will need their own
PathTargets, it will still be true that most Paths for a given relation
will compute the same tlist. To reduce the overhead added by this patch,
keep a "default PathTarget" in RelOptInfo, and allow Paths that compute
that column set to just point to their parent RelOptInfo's reltarget.
(In the patch as committed, actually every Path is like that, since we
do not yet have any cases of custom PathTargets.)
I took this opportunity to provide some more-honest costing of
PlaceHolderVar evaluation. Up to now, the assumption that "scan/join
reltargetlists have cost zero" was applied not only to Vars, where it's
reasonable, but also PlaceHolderVars where it isn't. Now, we add the eval
cost of a PlaceHolderVar's expression to the first plan level where it can
be computed, by including it in the PathTarget cost field and adding that
to the cost estimates for Paths. This isn't perfect yet but it's much
better than before, and there is a way forward to improve it more. This
costing change affects the join order chosen for a couple of the regression
tests, changing expected row ordering.
2016-02-19 02:01:49 +01:00
|
|
|
/* default result targetlist for Paths scanning this relation */
|
2016-03-14 21:59:59 +01:00
|
|
|
struct PathTarget *reltarget; /* list of Vars/Exprs, cost, width */
|
Add an explicit representation of the output targetlist to Paths.
Up to now, there's been an assumption that all Paths for a given relation
compute the same output column set (targetlist). However, there are good
reasons to remove that assumption. For example, an indexscan on an
expression index might be able to return the value of an expensive function
"for free". While we have the ability to generate such a plan today in
simple cases, we don't have a way to model that it's cheaper than a plan
that computes the function from scratch, nor a way to create such a plan
in join cases (where the function computation would normally happen at
the topmost join node). Also, we need this so that we can have Paths
representing post-scan/join steps, where the targetlist may well change
from one step to the next. Therefore, invent a "struct PathTarget"
representing the columns we expect a plan step to emit. It's convenient
to include the output tuple width and tlist evaluation cost in this struct,
and there will likely be additional fields in future.
While Path nodes that actually do have custom outputs will need their own
PathTargets, it will still be true that most Paths for a given relation
will compute the same tlist. To reduce the overhead added by this patch,
keep a "default PathTarget" in RelOptInfo, and allow Paths that compute
that column set to just point to their parent RelOptInfo's reltarget.
(In the patch as committed, actually every Path is like that, since we
do not yet have any cases of custom PathTargets.)
I took this opportunity to provide some more-honest costing of
PlaceHolderVar evaluation. Up to now, the assumption that "scan/join
reltargetlists have cost zero" was applied not only to Vars, where it's
reasonable, but also PlaceHolderVars where it isn't. Now, we add the eval
cost of a PlaceHolderVar's expression to the first plan level where it can
be computed, by including it in the PathTarget cost field and adding that
to the cost estimates for Paths. This isn't perfect yet but it's much
better than before, and there is a way forward to improve it more. This
costing change affects the join order chosen for a couple of the regression
tests, changing expected row ordering.
2016-02-19 02:01:49 +01:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* materialization information */
|
1999-02-04 02:47:02 +01:00
|
|
|
List *pathlist; /* Path structures */
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
List *ppilist; /* ParamPathInfos used in pathlist */
|
2016-01-20 20:29:22 +01:00
|
|
|
List *partial_pathlist; /* partial Paths */
|
2000-02-15 21:49:31 +01:00
|
|
|
struct Path *cheapest_startup_path;
|
|
|
|
struct Path *cheapest_total_path;
|
2003-01-20 19:55:07 +01:00
|
|
|
struct Path *cheapest_unique_path;
|
2012-01-28 01:26:38 +01:00
|
|
|
List *cheapest_parameterized_paths;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2015-12-08 00:56:14 +01:00
|
|
|
/* parameterization information needed for both base rels and join rels */
|
|
|
|
/* (see also lateral_vars and lateral_referencers) */
|
2015-12-11 21:52:16 +01:00
|
|
|
Relids direct_lateral_relids; /* rels directly laterally referenced */
|
2015-12-08 00:56:14 +01:00
|
|
|
Relids lateral_relids; /* minimum parameterization of rel */
|
|
|
|
|
2000-09-29 20:21:41 +02:00
|
|
|
/* information about a base rel (not set for join rels!) */
|
2003-02-08 21:20:55 +01:00
|
|
|
Index relid;
|
2010-01-05 22:54:00 +01:00
|
|
|
Oid reltablespace; /* containing tablespace */
|
Make the planner assume that the entries in a VALUES list are distinct.
Previously, if we had to estimate the number of distinct values in a
VALUES column, we fell back on the default behavior used whenever we lack
statistics, which effectively is that there are Min(# of entries, 200)
distinct values. This can be very badly off with a large VALUES list,
as noted by Jeff Janes.
We could consider actually running an ANALYZE-like scan on the VALUES,
but that seems unduly expensive, and anyway it could not deliver reliable
info if the entries are not all constants. What seems like a better choice
is to assume that the values are all distinct. This will sometimes be just
as wrong as the old code, but it seems more likely to be more nearly right
in many common cases. Also, it is more consistent with what happens in
some related cases, for example WHERE x = ANY(ARRAY[1,2,3,...,n]) and
WHERE x = ANY(VALUES (1),(2),(3),...,(n)) now are estimated similarly.
This was discussed some time ago, but consensus was it'd be better
to slip it in at the start of a development cycle not near the end.
(It should've gone into v10, really, but I forgot about it.)
Discussion: https://postgr.es/m/CAMkU=1xHkyPa8VQgGcCNg3RMFFvVxUdOpus1gKcFuvVi0w6Acg@mail.gmail.com
2017-08-16 21:37:14 +02:00
|
|
|
RTEKind rtekind; /* RELATION, SUBQUERY, FUNCTION, etc */
|
2003-06-30 01:05:05 +02:00
|
|
|
AttrNumber min_attr; /* smallest attrno of rel (often <0) */
|
|
|
|
AttrNumber max_attr; /* largest attrno of rel */
|
|
|
|
Relids *attr_needed; /* array indexed [min_attr .. max_attr] */
|
|
|
|
int32 *attr_widths; /* array indexed [min_attr .. max_attr] */
|
2012-08-27 04:48:55 +02:00
|
|
|
List *lateral_vars; /* LATERAL Vars and PHVs referenced by rel */
|
2013-08-18 02:22:37 +02:00
|
|
|
Relids lateral_referencers; /* rels that reference me laterally */
|
2008-10-17 22:23:45 +02:00
|
|
|
List *indexlist; /* list of IndexOptInfo */
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
|
|
|
List *statlist; /* list of StatisticExtInfo */
|
2011-10-14 23:23:01 +02:00
|
|
|
BlockNumber pages; /* size estimates derived from pg_class */
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality tuples;
|
2011-10-14 23:23:01 +02:00
|
|
|
double allvisfrac;
|
Speed up finding EquivalenceClasses for a given set of rels
Previously in order to determine which ECs a relation had members in, we
had to loop over all ECs stored in PlannerInfo's eq_classes and check if
ec_relids mentioned the relation. For the most part, this was fine, as
generally, unless queries were fairly complex, the overhead of performing
the lookup would have not been that significant. However, when queries
contained large numbers of joins and ECs, the overhead to find the set of
classes matching a given set of relations could become a significant
portion of the overall planning effort.
Here we allow a much more efficient method to access the ECs which match a
given relation or set of relations. A new Bitmapset field in RelOptInfo
now exists to store the indexes into PlannerInfo's eq_classes list which
each relation is mentioned in. This allows very fast lookups to find all
ECs belonging to a single relation. When we need to lookup ECs belonging
to a given pair of relations, we can simply bitwise-AND the Bitmapsets from
each relation and use the result to perform the lookup.
We also take the opportunity to write a new implementation of
generate_join_implied_equalities which makes use of the new indexes.
generate_join_implied_equalities_for_ecs must remain as is as it can be
given a custom list of ECs, which we can't easily determine the indexes of.
This was originally intended to fix the performance penalty of looking up
foreign keys matching a join condition which was introduced by 100340e2d.
However, we're speeding up much more than just that here.
Author: David Rowley, Tom Lane
Reviewed-by: Tom Lane, Tomas Vondra
Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
|
|
|
Bitmapset *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
|
|
|
|
* ECs that mention this rel */
|
2011-09-03 21:35:12 +02:00
|
|
|
PlannerInfo *subroot; /* if subquery */
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evaluation mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
List *subplan_params; /* if subquery */
|
2016-06-09 15:08:27 +02:00
|
|
|
int rel_parallel_workers; /* wanted number of parallel workers */
|
2021-02-27 10:59:36 +01:00
|
|
|
uint32 amflags; /* Bitmask of optional features supported by
|
|
|
|
* the table AM */
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
|
|
|
|
/* Information about foreign tables and foreign joins */
|
|
|
|
Oid serverid; /* identifies server for the table or join */
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
Oid userid; /* identifies user to check access as */
|
|
|
|
bool useridiscurrent; /* join is only valid for current user */
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
|
|
|
/* use "struct FdwRoutine" to avoid including fdwapi.h here */
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
struct FdwRoutine *fdwroutine;
|
|
|
|
void *fdw_private;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2017-04-08 04:20:03 +02:00
|
|
|
/* cache space for remembering if we have proven this relation unique */
|
|
|
|
List *unique_for_rels; /* known unique for these other relid
|
|
|
|
* set(s) */
|
|
|
|
List *non_unique_for_rels; /* known not unique for these set(s) */
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* used by various scans and joins: */
|
2000-02-07 05:41:04 +01:00
|
|
|
List *baserestrictinfo; /* RestrictInfo structures (if base rel) */
|
2003-01-12 23:35:29 +01:00
|
|
|
QualCost baserestrictcost; /* cost of evaluating the above */
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
Index baserestrict_min_security; /* min security_level found in
|
|
|
|
* baserestrictinfo */
|
2005-06-09 06:19:00 +02:00
|
|
|
List *joininfo; /* RestrictInfo structures for join clauses
|
|
|
|
* involving this rel */
|
2007-01-20 21:45:41 +01:00
|
|
|
bool has_eclass_joins; /* T means joininfo is incomplete */
|
Abstract logic to allow for multiple kinds of child rels.
Currently, the only type of child relation is an "other member rel",
which is the child of a baserel, but in the future joins and even
upper relations may have child rels. To facilitate that, introduce
macros that test for particular RelOptKind values, and use
them in various places where they help to clarify the sense of a test.
(For example, a test may allow RELOPT_OTHER_MEMBER_REL either because
it intends to allow child rels, or because it intends to allow simple
rels.)
Also, remove find_childrel_top_parent, which will not work for a
child rel that is not a baserel. Instead, add a new RelOptInfo
member top_parent_relids to track the same kind of information in a
more generic manner.
Ashutosh Bapat, slightly tweaked by me. Review and testing of the
patch set from which this was taken by Rajkumar Raghuwanshi and Rafia
Sabih.
Discussion: http://postgr.es/m/CA+TgmoagTnF2yqR3PT2rv=om=wJiZ4-A+ATwdnriTGku1CLYxA@mail.gmail.com
2017-04-04 04:41:31 +02:00
|
|
|
|
Disable support for partitionwise joins in problematic cases.
Commit f49842d, which added support for partitionwise joins, built the
child's tlist by applying adjust_appendrel_attrs() to the parent's. So in
the case where the parent's included a whole-row Var for the parent, the
child's contained a ConvertRowtypeExpr. To cope with that, that commit
added code to the planner, such as setrefs.c, but some code paths still
assumed that the tlist for a scan (or join) rel would only include Vars
and PlaceHolderVars, which was true before that commit, causing errors:
* When creating an explicit sort node for an input path for a mergejoin
path for a child join, prepare_sort_from_pathkeys() threw the 'could not
find pathkey item to sort' error.
* When deparsing a relation participating in a pushed down child join as a
subquery in contrib/postgres_fdw, get_relation_column_alias_ids() threw
the 'unexpected expression in subquery output' error.
* When performing set_plan_references() on a local join plan generated by
contrib/postgres_fdw for EvalPlanQual support for a pushed down child
join, fix_join_expr() threw the 'variable not found in subplan target
lists' error.
To fix these, two approaches have been proposed: one by Ashutosh Bapat and
one by me. While the former keeps building the child's tlist with a
ConvertRowtypeExpr, the latter builds it with a whole-row Var for the
child not to violate the planner assumption, and tries to fix it up later.
But both approaches need more work, so refuse to generate partitionwise
join paths when whole-row Vars are involved, instead. We don't need to
handle ConvertRowtypeExprs in the child's tlists for now, so this commit
also removes the changes to the planner.
Previously, partitionwise join computed attr_needed data for each child
separately, and built the child join's tlist using that data, which also
required an extra step for adding PlaceHolderVars to that tlist, but it
would be more efficient to build it from the parent join's tlist through
the adjust_appendrel_attrs() transformation. So this commit builds that
list that way, and simplifies build_joinrel_tlist() and placeholder.c as
well as part of set_append_rel_size() to basically what they were before
partitionwise join went in.
Back-patch to PG11 where partitionwise join was introduced.
Report by Rajkumar Raghuwanshi. Analysis by Ashutosh Bapat, who also
provided some of regression tests. Patch by me, reviewed by Robert Haas.
Discussion: https://postgr.es/m/CAKcux6ktu-8tefLWtQuuZBYFaZA83vUzuRd7c1YHC-yEWyYFpg@mail.gmail.com
2018-08-31 13:34:06 +02:00
|
|
|
/* used by partitionwise joins: */
|
2018-11-07 18:12:56 +01:00
|
|
|
bool consider_partitionwise_join; /* consider partitionwise join
|
|
|
|
* paths? (if partitioned rel) */
|
Disable support for partitionwise joins in problematic cases.
Commit f49842d, which added support for partitionwise joins, built the
child's tlist by applying adjust_appendrel_attrs() to the parent's. So in
the case where the parent's included a whole-row Var for the parent, the
child's contained a ConvertRowtypeExpr. To cope with that, that commit
added code to the planner, such as setrefs.c, but some code paths still
assumed that the tlist for a scan (or join) rel would only include Vars
and PlaceHolderVars, which was true before that commit, causing errors:
* When creating an explicit sort node for an input path for a mergejoin
path for a child join, prepare_sort_from_pathkeys() threw the 'could not
find pathkey item to sort' error.
* When deparsing a relation participating in a pushed down child join as a
subquery in contrib/postgres_fdw, get_relation_column_alias_ids() threw
the 'unexpected expression in subquery output' error.
* When performing set_plan_references() on a local join plan generated by
contrib/postgres_fdw for EvalPlanQual support for a pushed down child
join, fix_join_expr() threw the 'variable not found in subplan target
lists' error.
To fix these, two approaches have been proposed: one by Ashutosh Bapat and
one by me. While the former keeps building the child's tlist with a
ConvertRowtypeExpr, the latter builds it with a whole-row Var for the
child not to violate the planner assumption, and tries to fix it up later.
But both approaches need more work, so refuse to generate partitionwise
join paths when whole-row Vars are involved, instead. We don't need to
handle ConvertRowtypeExprs in the child's tlists for now, so this commit
also removes the changes to the planner.
Previously, partitionwise join computed attr_needed data for each child
separately, and built the child join's tlist using that data, which also
required an extra step for adding PlaceHolderVars to that tlist, but it
would be more efficient to build it from the parent join's tlist through
the adjust_appendrel_attrs() transformation. So this commit builds that
list that way, and simplifies build_joinrel_tlist() and placeholder.c as
well as part of set_append_rel_size() to basically what they were before
partitionwise join went in.
Back-patch to PG11 where partitionwise join was introduced.
Report by Rajkumar Raghuwanshi. Analysis by Ashutosh Bapat, who also
provided some of regression tests. Patch by me, reviewed by Robert Haas.
Discussion: https://postgr.es/m/CAKcux6ktu-8tefLWtQuuZBYFaZA83vUzuRd7c1YHC-yEWyYFpg@mail.gmail.com
2018-08-31 13:34:06 +02:00
|
|
|
Relids top_parent_relids; /* Relids of topmost parents (if "other"
|
|
|
|
* rel) */
|
2017-09-21 05:33:04 +02:00
|
|
|
|
2020-04-03 23:00:25 +02:00
|
|
|
/* used for partitioned relations: */
|
|
|
|
PartitionScheme part_scheme; /* Partitioning scheme */
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
|
|
|
int nparts; /* Number of partitions; -1 if not yet set; in
|
|
|
|
* case of a join relation 0 means it's
|
|
|
|
* considered unpartitioned */
|
2017-09-21 05:33:04 +02:00
|
|
|
struct PartitionBoundInfoData *boundinfo; /* Partition bounds */
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
|
|
|
bool partbounds_merged; /* True if partition bounds were created
|
|
|
|
* by partition_bounds_merge() */
|
2020-04-03 23:00:25 +02:00
|
|
|
List *partition_qual; /* Partition constraint, if not the root */
|
2017-09-21 05:33:04 +02:00
|
|
|
struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions,
|
2020-04-03 23:00:25 +02:00
|
|
|
* stored in the same order as bounds */
|
2021-08-03 01:47:24 +02:00
|
|
|
Bitmapset *live_parts; /* Bitmap with members acting as indexes into
|
|
|
|
* the part_rels[] array to indicate which
|
|
|
|
* partitions survived partition pruning. */
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
|
|
|
Relids all_partrels; /* Relids set of all partition relids */
|
2020-04-03 23:00:25 +02:00
|
|
|
List **partexprs; /* Non-nullable partition key expressions */
|
|
|
|
List **nullable_partexprs; /* Nullable partition key expressions */
|
1999-02-04 02:47:02 +01:00
|
|
|
} RelOptInfo;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Basic partition-wise join functionality.
Instead of joining two partitioned tables in their entirety we can, if
it is an equi-join on the partition keys, join the matching partitions
individually. This involves teaching the planner about "other join"
rels, which are related to regular join rels in the same way that
other member rels are related to baserels. This can use significantly
more CPU time and memory than regular join planning, because there may
now be a set of "other" rels not only for every base relation but also
for every join relation. In most practical cases, this probably
shouldn't be a problem, because (1) it's probably unusual to join many
tables each with many partitions using the partition keys for all
joins and (2) if you do that scenario then you probably have a big
enough machine to handle the increased memory cost of planning and (3)
the resulting plan is highly likely to be better, so what you spend in
planning you'll make up on the execution side. All the same, for now,
turn this feature off by default.
Currently, we can only perform joins between two tables whose
partitioning schemes are absolutely identical. It would be nice to
cope with other scenarios, such as extra partitions on one side or the
other with no match on the other side, but that will have to wait for
a future patch.
Ashutosh Bapat, reviewed and tested by Rajkumar Raghuwanshi, Amit
Langote, Rafia Sabih, Thomas Munro, Dilip Kumar, Antonin Houska, Amit
Khandekar, and by me. A few final adjustments by me.
Discussion: http://postgr.es/m/CAFjFpRfQ8GrQvzp3jA2wnLqrHmaXna-urjm_UY9BqXj=EaDTSA@mail.gmail.com
Discussion: http://postgr.es/m/CAFjFpRcitjfrULr5jfuKWRPsGUX0LQ0k8-yG0Qw2+1LBGNpMdw@mail.gmail.com
2017-10-06 17:11:10 +02:00
|
|
|
/*
|
|
|
|
* Is given relation partitioned?
|
|
|
|
*
|
2018-02-05 23:31:57 +01:00
|
|
|
* It's not enough to test whether rel->part_scheme is set, because it might
|
|
|
|
* be that the basic partitioning properties of the input relations matched
|
Fix handling of targetlist SRFs when scan/join relation is known empty.
When we introduced separate ProjectSetPath nodes for application of
set-returning functions in v10, we inadvertently broke some cases where
we're supposed to recognize that the result of a subquery is known to be
empty (contain zero rows). That's because IS_DUMMY_REL was just looking
for a childless AppendPath without allowing for a ProjectSetPath being
possibly stuck on top. In itself, this didn't do anything much worse
than produce slightly worse plans for some corner cases.
Then in v11, commit 11cf92f6e rearranged things to allow the scan/join
targetlist to be applied directly to partial paths before they get
gathered. But it inserted a short-circuit path for dummy relations
that was a little too short: it failed to insert a ProjectSetPath node
at all for a targetlist containing set-returning functions, resulting in
bogus "set-valued function called in context that cannot accept a set"
errors, as reported in bug #15669 from Madelaine Thibaut.
The best way to fix this mess seems to be to reimplement IS_DUMMY_REL
so that it drills down through any ProjectSetPath nodes that might be
there (and it seems like we'd better allow for ProjectionPath as well).
While we're at it, make it look at rel->pathlist not cheapest_total_path,
so that it gives the right answer independently of whether set_cheapest
has been done lately. That dependency looks pretty shaky in the context
of code like apply_scanjoin_target_to_paths, and even if it's not broken
today it'd certainly bite us at some point. (Nastily, unsafe use of the
old coding would almost always work; the hazard comes down to possibly
looking through a dangling pointer, and only once in a blue moon would
you find something there that resulted in the wrong answer.)
It now looks like it was a mistake for IS_DUMMY_REL to be a macro: if
there are any extensions using it, they'll continue to use the old
inadequate logic until they're recompiled, after which they'll fail
to load into server versions predating this fix. Hopefully there are
few such extensions.
Having fixed IS_DUMMY_REL, the special path for dummy rels in
apply_scanjoin_target_to_paths is unnecessary as well as being wrong,
so we can just drop it.
Also change a few places that were testing for partitioned-ness of a
planner relation but not using IS_PARTITIONED_REL for the purpose; that
seems unsafe as well as inconsistent, plus it required an ugly hack in
apply_scanjoin_target_to_paths.
In passing, save a few cycles in apply_scanjoin_target_to_paths by
skipping processing of pre-existing paths for partitioned rels,
and do some cosmetic cleanup and comment adjustment in that function.
I renamed IS_DUMMY_PATH to IS_DUMMY_APPEND with the intention of breaking
any code that might be using it, since in almost every case that would
be wrong; IS_DUMMY_REL is what to be using instead.
In HEAD, also make set_dummy_rel_pathlist static (since it's no longer
used from outside allpaths.c), and delete is_dummy_plan, since it's no
longer used anywhere.
Back-patch as appropriate into v11 and v10.
Tom Lane and Julien Rouhaud
Discussion: https://postgr.es/m/15669-02fb3296cca26203@postgresql.org
2019-03-07 20:21:52 +01:00
|
|
|
* but the partition bounds did not. Also, if we are able to prove a rel
|
|
|
|
* dummy (empty), we should henceforth treat it as unpartitioned.
|
Basic partition-wise join functionality.
Instead of joining two partitioned tables in their entirety we can, if
it is an equi-join on the partition keys, join the matching partitions
individually. This involves teaching the planner about "other join"
rels, which are related to regular join rels in the same way that
other member rels are related to baserels. This can use significantly
more CPU time and memory than regular join planning, because there may
now be a set of "other" rels not only for every base relation but also
for every join relation. In most practical cases, this probably
shouldn't be a problem, because (1) it's probably unusual to join many
tables each with many partitions using the partition keys for all
joins and (2) if you do that scenario then you probably have a big
enough machine to handle the increased memory cost of planning and (3)
the resulting plan is highly likely to be better, so what you spend in
planning you'll make up on the execution side. All the same, for now,
turn this feature off by default.
Currently, we can only perform joins between two tables whose
partitioning schemes are absolutely identical. It would be nice to
cope with other scenarios, such as extra partitions on one side or the
other with no match on the other side, but that will have to wait for
a future patch.
Ashutosh Bapat, reviewed and tested by Rajkumar Raghuwanshi, Amit
Langote, Rafia Sabih, Thomas Munro, Dilip Kumar, Antonin Houska, Amit
Khandekar, and by me. A few final adjustments by me.
Discussion: http://postgr.es/m/CAFjFpRfQ8GrQvzp3jA2wnLqrHmaXna-urjm_UY9BqXj=EaDTSA@mail.gmail.com
Discussion: http://postgr.es/m/CAFjFpRcitjfrULr5jfuKWRPsGUX0LQ0k8-yG0Qw2+1LBGNpMdw@mail.gmail.com
2017-10-06 17:11:10 +02:00
|
|
|
*/
|
|
|
|
#define IS_PARTITIONED_REL(rel) \
	((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \
	 (rel)->part_rels && !IS_DUMMY_REL(rel))
|
Basic partition-wise join functionality.
Instead of joining two partitioned tables in their entirety we can, if
it is an equi-join on the partition keys, join the matching partitions
individually. This involves teaching the planner about "other join"
rels, which are related to regular join rels in the same way that
other member rels are related to baserels. This can use significantly
more CPU time and memory than regular join planning, because there may
now be a set of "other" rels not only for every base relation but also
for every join relation. In most practical cases, this probably
shouldn't be a problem, because (1) it's probably unusual to join many
tables each with many partitions using the partition keys for all
joins and (2) if you do that scenario then you probably have a big
enough machine to handle the increased memory cost of planning and (3)
the resulting plan is highly likely to be better, so what you spend in
planning you'll make up on the execution side. All the same, for now,
turn this feature off by default.
Currently, we can only perform joins between two tables whose
partitioning schemes are absolutely identical. It would be nice to
cope with other scenarios, such as extra partitions on one side or the
other with no match on the other side, but that will have to wait for
a future patch.
Ashutosh Bapat, reviewed and tested by Rajkumar Raghuwanshi, Amit
Langote, Rafia Sabih, Thomas Munro, Dilip Kumar, Antonin Houska, Amit
Khandekar, and by me. A few final adjustments by me.
Discussion: http://postgr.es/m/CAFjFpRfQ8GrQvzp3jA2wnLqrHmaXna-urjm_UY9BqXj=EaDTSA@mail.gmail.com
Discussion: http://postgr.es/m/CAFjFpRcitjfrULr5jfuKWRPsGUX0LQ0k8-yG0Qw2+1LBGNpMdw@mail.gmail.com
2017-10-06 17:11:10 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Convenience macro to make sure that a partitioned relation has all the
|
|
|
|
* required members set.
|
|
|
|
*/
|
|
|
|
#define REL_HAS_ALL_PART_PROPS(rel) \
	((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \
	 (rel)->part_rels && (rel)->partexprs && (rel)->nullable_partexprs)
|
|
|
|
|
2000-01-09 01:26:47 +01:00
|
|
|
/*
|
|
|
|
* IndexOptInfo
|
|
|
|
* Per-index information for planning/optimization
|
|
|
|
*
|
2018-04-07 22:00:39 +02:00
|
|
|
* indexkeys[], indexcollations[] each have ncolumns entries.
|
|
|
|
* opfamily[], and opcintype[] each have nkeycolumns entries. They do
|
|
|
|
* not contain any information about included attributes.
|
2011-03-26 21:35:25 +01:00
|
|
|
*
|
2018-04-07 22:00:39 +02:00
|
|
|
* sortopfamily[], reverse_sort[], and nulls_first[] have
|
|
|
|
* nkeycolumns entries, if the index is ordered; but if it is unordered,
|
2010-11-29 18:29:42 +01:00
|
|
|
* those pointers are NULL.
|
2007-01-09 03:14:16 +01:00
|
|
|
*
|
2003-05-28 18:04:02 +02:00
|
|
|
* Zeroes in the indexkeys[] array indicate index columns that are
|
|
|
|
* expressions; there is one element in indexprs for each such column.
|
2001-05-20 22:28:20 +02:00
|
|
|
*
|
2010-11-29 18:29:42 +01:00
|
|
|
* For an ordered index, reverse_sort[] and nulls_first[] describe the
|
|
|
|
* sort ordering of a forward indexscan; we can also consider a backward
|
|
|
|
* indexscan, which will generate the reverse ordering.
|
2003-05-28 18:04:02 +02:00
|
|
|
*
|
|
|
|
* The indexprs and indpred expressions have been run through
|
2003-12-28 22:57:37 +01:00
|
|
|
* prepqual.c and eval_const_expressions() for ease of matching to
|
2007-01-09 03:14:16 +01:00
|
|
|
* WHERE clauses. indpred is in implicit-AND form.
|
2011-10-11 20:20:06 +02:00
|
|
|
*
|
|
|
|
* indextlist is a TargetEntry list representing the index columns.
|
|
|
|
* It provides an equivalent base-relation Var for each simple column,
|
|
|
|
* and links to the matching indexprs element for each expression column.
|
Support using index-only scans with partial indexes in more cases.
Previously, the planner would reject an index-only scan if any restriction
clause for its table used a column not available from the index, even
if that restriction clause would later be dropped from the plan entirely
because it's implied by the index's predicate. This is a fairly common
situation for partial indexes because predicates using columns not included
in the index are often the most useful kind of predicate, and we have to
duplicate (or at least imply) the predicate in the WHERE clause in order
to get the index to be considered at all. So index-only scans were
essentially unavailable with such partial indexes.
To fix, we have to do detection of implied-by-predicate clauses much
earlier in the planner. This patch puts it in check_index_predicates
(nee check_partial_indexes), meaning it gets done for every partial index,
whereas we previously only considered this issue at createplan time,
so that the work was only done for an index actually selected for use.
That could result in a noticeable planning slowdown for queries against
tables with many partial indexes. However, testing suggested that there
isn't really a significant cost, especially not with reasonable numbers
of partial indexes. We do get a small additional benefit, which is that
cost_index is more accurate since it correctly discounts the evaluation
cost of clauses that will be removed. We can also avoid considering such
clauses as potential indexquals, which saves useless matching cycles in
the case where the predicate columns aren't in the index, and prevents
generating bogus plans that double-count the clause's selectivity when
the columns are in the index.
Tomas Vondra and Kyotaro Horiguchi, reviewed by Kevin Grittner and
Konstantin Knizhnik, and whacked around a little by me
2016-03-31 20:48:56 +02:00
|
|
|
*
|
|
|
|
* While most of these fields are filled when the IndexOptInfo is created
|
|
|
|
* (by plancat.c), indrestrictinfo and predOK are set later, in
|
|
|
|
* check_index_predicates().
|
2000-01-09 01:26:47 +01:00
|
|
|
*/
|
2019-02-12 03:26:08 +01:00
|
|
|
/*
 * Forward typedef for IndexOptInfo, guarded so that another header can
 * supply the identical typedef first without a duplicate-typedef error.
 */
#ifndef HAVE_INDEXOPTINFO_TYPEDEF
typedef struct IndexOptInfo IndexOptInfo;
#define HAVE_INDEXOPTINFO_TYPEDEF 1
#endif
|
|
|
|
|
|
|
|
struct IndexOptInfo
|
2000-01-09 01:26:47 +01:00
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
|
|
|
|
Oid indexoid; /* OID of the index relation */
|
2010-01-05 22:54:00 +01:00
|
|
|
Oid reltablespace; /* tablespace of index (not table) */
|
2005-03-27 08:29:49 +02:00
|
|
|
RelOptInfo *rel; /* back-link to index's table */
|
2000-01-09 01:26:47 +01:00
|
|
|
|
Redesign the planner's handling of index-descent cost estimation.
Historically we've used a couple of very ad-hoc fudge factors to try to
get the right results when indexes of different sizes would satisfy a
query with the same number of index leaf tuples being visited. In
commit 21a39de5809cd3050a37d2554323cc1d0cbeed9d I tweaked one of these
fudge factors, with results that proved disastrous for larger indexes.
Commit bf01e34b556ff37982ba2d882db424aa484c0d07 fudged it some more,
but still with not a lot of principle behind it.
What seems like a better way to address these issues is to explicitly model
index-descent costs, since that's what's really at stake when considering
different indexes with similar leaf-page-level costs.  We tried that once
long ago, and found that charging random_page_cost per page descended
through was way too much, because upper btree levels tend to stay in cache
in real-world workloads. However, there's still CPU costs to think about,
and the previous fudge factors can be seen as a crude attempt to account
for those costs. So this patch replaces those fudge factors with explicit
charges for the number of tuple comparisons needed to descend the index
tree, plus a small charge per page touched in the descent. The cost
multipliers are chosen so that the resulting charges are in the vicinity of
the historical (pre-9.2) fudge factors for indexes of up to about a million
tuples, while not ballooning unreasonably beyond that, as the old fudge
factor did (even more so in 9.2).
To make this work accurately for btree indexes, add some code that allows
extraction of the known root-page height from a btree. There's no
equivalent number readily available for other index types, but we can use
the log of the number of index pages as an approximate substitute.
This seems like too much of a behavioral change to risk back-patching,
but it should improve matters going forward. In 9.2 I'll just revert
the fudge-factor change.
2013-01-11 18:56:58 +01:00
|
|
|
/* index-size statistics (from pg_class and elsewhere) */
|
2004-12-01 20:00:56 +01:00
|
|
|
BlockNumber pages; /* number of disk pages in index */
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality tuples; /* number of index tuples in index */
|
Redesign the planner's handling of index-descent cost estimation.
Historically we've used a couple of very ad-hoc fudge factors to try to
get the right results when indexes of different sizes would satisfy a
query with the same number of index leaf tuples being visited. In
commit 21a39de5809cd3050a37d2554323cc1d0cbeed9d I tweaked one of these
fudge factors, with results that proved disastrous for larger indexes.
Commit bf01e34b556ff37982ba2d882db424aa484c0d07 fudged it some more,
but still with not a lot of principle behind it.
What seems like a better way to address these issues is to explicitly model
index-descent costs, since that's what's really at stake when considering
different indexes with similar leaf-page-level costs.  We tried that once
long ago, and found that charging random_page_cost per page descended
through was way too much, because upper btree levels tend to stay in cache
in real-world workloads. However, there's still CPU costs to think about,
and the previous fudge factors can be seen as a crude attempt to account
for those costs. So this patch replaces those fudge factors with explicit
charges for the number of tuple comparisons needed to descend the index
tree, plus a small charge per page touched in the descent. The cost
multipliers are chosen so that the resulting charges are in the vicinity of
the historical (pre-9.2) fudge factors for indexes of up to about a million
tuples, while not ballooning unreasonably beyond that, as the old fudge
factor did (even more so in 9.2).
To make this work accurately for btree indexes, add some code that allows
extraction of the known root-page height from a btree. There's no
equivalent number readily available for other index types, but we can use
the log of the number of index pages as an approximate substitute.
This seems like too much of a behavioral change to risk back-patching,
but it should improve matters going forward. In 9.2 I'll just revert
the fudge-factor change.
2013-01-11 18:56:58 +01:00
|
|
|
int tree_height; /* index tree height, or -1 if unknown */
|
2000-01-09 01:26:47 +01:00
|
|
|
|
|
|
|
/* index descriptor information */
|
2001-05-20 22:28:20 +02:00
|
|
|
int ncolumns; /* number of columns in index */
|
2018-04-07 22:00:39 +02:00
|
|
|
int nkeycolumns; /* number of key columns in index */
|
|
|
|
int *indexkeys; /* column numbers of index's attributes both
|
|
|
|
* key and included columns, or 0 */
|
2011-02-17 01:24:45 +01:00
|
|
|
Oid *indexcollations; /* OIDs of collations of index columns */
|
2011-03-26 21:35:25 +01:00
|
|
|
Oid *opfamily; /* OIDs of operator families for columns */
|
2007-05-31 18:57:34 +02:00
|
|
|
Oid *opcintype; /* OIDs of opclass declared input data types */
|
2010-11-29 18:29:42 +01:00
|
|
|
Oid *sortopfamily; /* OIDs of btree opfamilies, if orderable */
|
|
|
|
bool *reverse_sort; /* is sort order descending? */
|
2007-01-09 03:14:16 +01:00
|
|
|
bool *nulls_first; /* do NULLs come first in the sort order? */
|
Implement operator class parameters
PostgreSQL provides set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There opclasses define representation of keys, operations on
them and supported search strategies. So, it's natural that opclasses may be
faced some tradeoffs, which require user-side decision. This commit implements
opclass parameters allowing users to set some values, which tell opclass how to
index the particular dataset.
This commit doesn't introduce new storage in system catalog. Instead it uses
pg_attribute.attoptions, which is used for table column storage options but
unused for index attributes.
In order to evade changing signature of each opclass support function, we
implement unified way to pass options to opclass support functions. Options
are set to fn_expr as the constant bytea expression. It's possible due to the
fact that opclass support functions are executed outside of expressions, so
fn_expr is unused for them.
This commit comes with some examples of opclass options usage. We parametrize
signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also we parametrize maximum number of integer ranges for
gist__int_ops. However, the main future usage of this feature is expected
to be json, where users would be able to specify which way to index particular
json parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
|
|
|
bytea **opclassoptions; /* opclass-specific options for columns */
|
2015-03-26 18:12:00 +01:00
|
|
|
bool *canreturn; /* which index cols can be returned in an
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* index-only scan? */
|
2000-01-09 01:26:47 +01:00
|
|
|
Oid relam; /* OID of the access method (in pg_am) */
|
|
|
|
|
2003-05-28 18:04:02 +02:00
|
|
|
List *indexprs; /* expressions for non-simple index columns */
|
2002-11-24 22:52:15 +01:00
|
|
|
List *indpred; /* predicate if a partial index, else NIL */
|
2004-01-04 01:07:32 +01:00
|
|
|
|
2011-10-11 20:20:06 +02:00
|
|
|
List *indextlist; /* targetlist representing index columns */
|
|
|
|
|
Support using index-only scans with partial indexes in more cases.
Previously, the planner would reject an index-only scan if any restriction
clause for its table used a column not available from the index, even
if that restriction clause would later be dropped from the plan entirely
because it's implied by the index's predicate. This is a fairly common
situation for partial indexes because predicates using columns not included
in the index are often the most useful kind of predicate, and we have to
duplicate (or at least imply) the predicate in the WHERE clause in order
to get the index to be considered at all. So index-only scans were
essentially unavailable with such partial indexes.
To fix, we have to do detection of implied-by-predicate clauses much
earlier in the planner. This patch puts it in check_index_predicates
(nee check_partial_indexes), meaning it gets done for every partial index,
whereas we previously only considered this issue at createplan time,
so that the work was only done for an index actually selected for use.
That could result in a noticeable planning slowdown for queries against
tables with many partial indexes. However, testing suggested that there
isn't really a significant cost, especially not with reasonable numbers
of partial indexes. We do get a small additional benefit, which is that
cost_index is more accurate since it correctly discounts the evaluation
cost of clauses that will be removed. We can also avoid considering such
clauses as potential indexquals, which saves useless matching cycles in
the case where the predicate columns aren't in the index, and prevents
generating bogus plans that double-count the clause's selectivity when
the columns are in the index.
Tomas Vondra and Kyotaro Horiguchi, reviewed by Kevin Grittner and
Konstantin Knizhnik, and whacked around a little by me
2016-03-31 20:48:56 +02:00
|
|
|
List *indrestrictinfo; /* parent relation's baserestrictinfo
|
|
|
|
* list, less any conditions implied by
|
|
|
|
* the index's predicate (unless it's a
|
|
|
|
* target rel, see comments in
|
|
|
|
* check_index_predicates()) */
|
|
|
|
|
|
|
|
bool predOK; /* true if index predicate matches query */
|
2002-11-24 22:52:15 +01:00
|
|
|
bool unique; /* true if a unique index */
|
2011-10-23 06:43:39 +02:00
|
|
|
bool immediate; /* is uniqueness enforced immediately? */
|
2011-02-17 01:24:45 +01:00
|
|
|
bool hypothetical; /* true if index doesn't really exist */
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
|
|
|
|
/* Remaining fields are copied from the index AM's API struct: */
|
2010-11-24 20:20:39 +01:00
|
|
|
bool amcanorderbyop; /* does AM support order by operator result? */
|
2005-06-14 01:14:49 +02:00
|
|
|
bool amoptionalkey; /* can query omit key for the first column? */
|
2011-10-16 21:39:24 +02:00
|
|
|
bool amsearcharray; /* can AM handle ScalarArrayOpExpr quals? */
|
2010-01-01 22:53:49 +01:00
|
|
|
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
|
2009-03-06 00:06:45 +01:00
|
|
|
bool amhasgettuple; /* does AM have amgettuple interface? */
|
|
|
|
bool amhasgetbitmap; /* does AM have amgetbitmap interface? */
|
2017-02-15 19:53:24 +01:00
|
|
|
bool amcanparallel; /* does AM support parallel scan? */
|
2020-11-24 21:58:32 +01:00
|
|
|
bool amcanmarkpos; /* does AM support mark/restore? */
|
2019-12-27 00:09:00 +01:00
|
|
|
/* Rather than include amapi.h here, we declare amcostestimate like this */
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
void (*amcostestimate) (); /* AM's cost estimator */
|
2019-02-12 03:26:08 +01:00
|
|
|
};
|
2000-01-09 01:26:47 +01:00
|
|
|
|
2016-06-18 21:22:34 +02:00
|
|
|
/*
|
|
|
|
* ForeignKeyOptInfo
|
|
|
|
* Per-foreign-key information for planning/optimization
|
|
|
|
*
|
|
|
|
* The per-FK-column arrays can be fixed-size because we allow at most
|
|
|
|
* INDEX_MAX_KEYS columns in a foreign key constraint. Each array has
|
|
|
|
* nkeys valid entries.
|
|
|
|
*/
|
|
|
|
typedef struct ForeignKeyOptInfo
|
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
|
|
|
|
/* Basic data about the foreign key (fetched from catalogs): */
|
|
|
|
Index con_relid; /* RT index of the referencing table */
|
|
|
|
Index ref_relid; /* RT index of the referenced table */
|
|
|
|
int nkeys; /* number of columns in the foreign key */
|
|
|
|
AttrNumber conkey[INDEX_MAX_KEYS]; /* cols in referencing table */
|
|
|
|
AttrNumber confkey[INDEX_MAX_KEYS]; /* cols in referenced table */
|
|
|
|
Oid conpfeqop[INDEX_MAX_KEYS]; /* PK = FK operator OIDs */
|
|
|
|
|
|
|
|
/* Derived info about whether FK's equality conditions match the query: */
|
|
|
|
int nmatched_ec; /* # of FK cols matched by ECs */
|
Fix foreign-key selectivity estimation in the presence of constants.
get_foreign_key_join_selectivity() looks for join clauses that equate
the two sides of the FK constraint. However, if we have a query like
"WHERE fktab.a = pktab.a and fktab.a = 1", it won't find any such join
clause, because equivclass.c replaces the given clauses with "fktab.a
= 1 and pktab.a = 1", which can be enforced at the scan level, leaving
nothing to be done for column "a" at the join level.
We can fix that expectation without much trouble, but then a new problem
arises: applying the foreign-key-based selectivity rule produces a
rowcount underestimate, because we're effectively double-counting the
selectivity of the "fktab.a = 1" clause. So we have to cancel that
selectivity out of the estimate.
To fix, refactor process_implied_equality() so that it can pass back the
new RestrictInfo to its callers in equivclass.c, allowing the generated
"fktab.a = 1" clause to be saved in the EquivalenceClass's ec_derives
list. Then it's not much trouble to dig out the relevant RestrictInfo
when we need to adjust an FK selectivity estimate. (While at it, we
can also remove the expensive use of initialize_mergeclause_eclasses()
to set up the new RestrictInfo's left_ec and right_ec pointers.
The equivclass.c code can set those basically for free.)
This seems like clearly a bug fix, but I'm hesitant to back-patch it,
first because there's some API/ABI risk for extensions and second because
we're usually loath to destabilize plan choices in stable branches.
Per report from Sigrid Ehrenreich.
Discussion: https://postgr.es/m/1019549.1603770457@sss.pgh.pa.us
Discussion: https://postgr.es/m/AM6PR02MB5287A0ADD936C1FA80973E72AB190@AM6PR02MB5287.eurprd02.prod.outlook.com
2020-10-28 16:15:47 +01:00
|
|
|
int nconst_ec; /* # of these ECs that are ec_has_const */
|
2016-06-18 21:22:34 +02:00
|
|
|
int nmatched_rcols; /* # of FK cols matched by non-EC rinfos */
|
|
|
|
int nmatched_ri; /* total # of non-EC rinfos matched to FK */
|
|
|
|
/* Pointer to eclass matching each column's condition, if there is one */
|
|
|
|
struct EquivalenceClass *eclass[INDEX_MAX_KEYS];
|
Fix foreign-key selectivity estimation in the presence of constants.
get_foreign_key_join_selectivity() looks for join clauses that equate
the two sides of the FK constraint. However, if we have a query like
"WHERE fktab.a = pktab.a and fktab.a = 1", it won't find any such join
clause, because equivclass.c replaces the given clauses with "fktab.a
= 1 and pktab.a = 1", which can be enforced at the scan level, leaving
nothing to be done for column "a" at the join level.
We can fix that expectation without much trouble, but then a new problem
arises: applying the foreign-key-based selectivity rule produces a
rowcount underestimate, because we're effectively double-counting the
selectivity of the "fktab.a = 1" clause. So we have to cancel that
selectivity out of the estimate.
To fix, refactor process_implied_equality() so that it can pass back the
new RestrictInfo to its callers in equivclass.c, allowing the generated
"fktab.a = 1" clause to be saved in the EquivalenceClass's ec_derives
list. Then it's not much trouble to dig out the relevant RestrictInfo
when we need to adjust an FK selectivity estimate. (While at it, we
can also remove the expensive use of initialize_mergeclause_eclasses()
to set up the new RestrictInfo's left_ec and right_ec pointers.
The equivclass.c code can set those basically for free.)
This seems like clearly a bug fix, but I'm hesitant to back-patch it,
first because there's some API/ABI risk for extensions and second because
we're usually loath to destabilize plan choices in stable branches.
Per report from Sigrid Ehrenreich.
Discussion: https://postgr.es/m/1019549.1603770457@sss.pgh.pa.us
Discussion: https://postgr.es/m/AM6PR02MB5287A0ADD936C1FA80973E72AB190@AM6PR02MB5287.eurprd02.prod.outlook.com
2020-10-28 16:15:47 +01:00
|
|
|
/* Pointer to eclass member for the referencing Var, if there is one */
|
|
|
|
struct EquivalenceMember *fk_eclass_member[INDEX_MAX_KEYS];
|
2016-06-18 21:22:34 +02:00
|
|
|
/* List of non-EC RestrictInfos matching each column's condition */
|
|
|
|
List *rinfos[INDEX_MAX_KEYS];
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
} ForeignKeyOptInfo;
|
2016-06-18 21:22:34 +02:00
|
|
|
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
|
|
|
/*
|
|
|
|
* StatisticExtInfo
|
|
|
|
* Information about extended statistics for planning/optimization
|
|
|
|
*
|
2017-04-06 17:27:15 +02:00
|
|
|
* Each pg_statistic_ext row is represented by one or more nodes of this
|
|
|
|
* type, or even zero if ANALYZE has not computed them.
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
|
|
|
*/
|
|
|
|
/* One extended-statistics object usable for planning on a relation */
typedef struct StatisticExtInfo
{
	NodeTag		type;

	Oid			statOid;		/* OID of the statistics row */
	bool		inherit;		/* includes child relations */
	RelOptInfo *rel;			/* back-link to statistic's table */
	char		kind;			/* statistics kind of this entry (single
								 * kind per entry; presumably one of the
								 * STATS_EXT_* codes — confirm in
								 * catalog/pg_statistic_ext.h) */
	Bitmapset  *keys;			/* attnums of the columns covered */
	List	   *exprs;			/* expressions covered, if any (in addition
								 * to plain column references) */
} StatisticExtInfo;
|
2002-03-12 01:52:10 +01:00
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
/*
 * EquivalenceClasses
 *
 * Whenever we can determine that a mergejoinable equality clause A = B is
 * not delayed by any outer join, we create an EquivalenceClass containing
 * the expressions A and B to record this knowledge.  If we later find another
 * equivalence B = C, we add C to the existing EquivalenceClass; this may
 * require merging two existing EquivalenceClasses.  At the end of the qual
 * distribution process, we have sets of values that are known all transitively
 * equal to each other, where "equal" is according to the rules of the btree
 * operator family(s) shown in ec_opfamilies, as well as the collation shown
 * by ec_collation.  (We restrict an EC to contain only equalities whose
 * operators belong to the same set of opfamilies.  This could probably be
 * relaxed, but for now it's not worth the trouble, since nearly all equality
 * operators belong to only one btree opclass anyway.  Similarly, we suppose
 * that all or none of the input datatypes are collatable, so that a single
 * collation value is sufficient.)
 *
 * We also use EquivalenceClasses as the base structure for PathKeys, letting
 * us represent knowledge about different sort orderings being equivalent.
 * Since every PathKey must reference an EquivalenceClass, we will end up
 * with single-member EquivalenceClasses whenever a sort key expression has
 * not been equivalenced to anything else.  It is also possible that such an
 * EquivalenceClass will contain a volatile expression ("ORDER BY random()"),
 * which is a case that can't arise otherwise since clauses containing
 * volatile functions are never considered mergejoinable.  We mark such
 * EquivalenceClasses specially to prevent them from being merged with
 * ordinary EquivalenceClasses.  Also, for volatile expressions we have
 * to be careful to match the EquivalenceClass to the correct targetlist
 * entry: consider SELECT random() AS a, random() AS b ... ORDER BY b,a.
 * So we record the SortGroupRef of the originating sort clause.
 *
 * We allow equality clauses appearing below the nullable side of an outer join
 * to form EquivalenceClasses, but these have a slightly different meaning:
 * the included values might be all NULL rather than all the same non-null
 * values.  See src/backend/optimizer/README for more on that point.
 *
 * NB: if ec_merged isn't NULL, this class has been merged into another, and
 * should be ignored in favor of using the pointed-to class.
 */
typedef struct EquivalenceClass
{
	NodeTag		type;

	List	   *ec_opfamilies;	/* btree operator family OIDs */
	Oid			ec_collation;	/* collation, if datatypes are collatable */
	List	   *ec_members;		/* list of EquivalenceMembers */
	List	   *ec_sources;		/* list of generating RestrictInfos */
	List	   *ec_derives;		/* list of derived RestrictInfos */
	Relids		ec_relids;		/* all relids appearing in ec_members, except
								 * for child members (see EquivalenceMember) */
	bool		ec_has_const;	/* any pseudoconstants in ec_members? */
	bool		ec_has_volatile;	/* the (sole) member is a volatile expr */
	bool		ec_below_outer_join;	/* equivalence applies below an OJ */
	bool		ec_broken;		/* failed to generate needed clauses? */
	Index		ec_sortref;		/* originating sortclause label, or 0 */
	Index		ec_min_security;	/* minimum security_level in ec_sources */
	Index		ec_max_security;	/* maximum security_level in ec_sources */
	struct EquivalenceClass *ec_merged; /* set if merged into another EC */
} EquivalenceClass;
|
|
|
|
|
Fix some planner issues found while investigating Kevin Grittner's report
of poorer planning in 8.3 than 8.2:
1. After pushing a constant across an outer join --- ie, given
"a LEFT JOIN b ON (a.x = b.y) WHERE a.x = 42", we can deduce that b.y is
sort of equal to 42, in the sense that we needn't fetch any b rows where
it isn't 42 --- loop to see if any additional deductions can be made.
Previous releases did that by recursing, but I had mistakenly thought that
this was no longer necessary given the EquivalenceClass machinery.
2. Allow pushing constants across outer join conditions even if the
condition is outerjoin_delayed due to a lower outer join. This is safe
as long as the condition is strict and we re-test it at the upper join.
3. Keep the outer-join clause even if we successfully push a constant
across it. This is *necessary* in the outerjoin_delayed case, but
even in the simple case, it seems better to do this to ensure that the
join search order heuristics will consider the join as reasonable to
make. Mark such a clause as having selectivity 1.0, though, since it's
not going to eliminate very many rows after application of the constant
condition.
4. Tweak have_relevant_eclass_joinclause to report that two relations
are joinable when they have vars that are equated to the same constant.
We won't actually generate any joinclause from such an EquivalenceClass,
but again it seems that in such a case it's a good idea to consider
the join as worth costing out.
5. Fix a bug in select_mergejoin_clauses that was exposed by these
changes: we have to reject candidate mergejoin clauses if either side was
equated to a constant, because we can't construct a canonical pathkey list
for such a clause. This is an implementation restriction that might be
worth fixing someday, but it doesn't seem critical to get it done for 8.3.
2008-01-09 21:42:29 +01:00
|
|
|
/*
 * If an EC contains a const and isn't below-outer-join, any PathKey depending
 * on it must be redundant, since there's only one possible value of the key.
 * (Below an outer join the "constant" might actually be NULL for some rows,
 * per the EquivalenceClass comments, so the test excludes that case.)
 */
#define EC_MUST_BE_REDUNDANT(eclass)  \
	((eclass)->ec_has_const && !(eclass)->ec_below_outer_join)
|
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
/*
 * EquivalenceMember - one member expression of an EquivalenceClass
 *
 * em_is_child signifies that this element was built by transposing a member
 * for an appendrel parent relation to represent the corresponding expression
 * for an appendrel child.  These members are used for determining the
 * pathkeys of scans on the child relation and for explicitly sorting the
 * child when necessary to build a MergeAppend path for the whole appendrel
 * tree.  An em_is_child member has no impact on the properties of the EC as a
 * whole; in particular the EC's ec_relids field does NOT include the child
 * relation.  An em_is_child member should never be marked em_is_const nor
 * cause ec_has_const or ec_has_volatile to be set, either.  Thus, em_is_child
 * members are not really full-fledged members of the EC, but just reflections
 * or doppelgangers of real members.  Most operations on EquivalenceClasses
 * should ignore em_is_child members, and those that don't should test
 * em_relids to make sure they only consider relevant members.
 *
 * em_datatype is usually the same as exprType(em_expr), but can be
 * different when dealing with a binary-compatible opfamily; in particular
 * anyarray_ops would never work without this.  Use em_datatype when
 * looking up a specific btree operator to work with this expression.
 */
typedef struct EquivalenceMember
{
	NodeTag		type;

	Expr	   *em_expr;		/* the expression represented */
	Relids		em_relids;		/* all relids appearing in em_expr */
	Relids		em_nullable_relids; /* nullable by lower outer joins */
	bool		em_is_const;	/* expression is pseudoconstant? */
	bool		em_is_child;	/* derived version for a child relation? */
	Oid			em_datatype;	/* the "nominal type" used by the opfamily */
} EquivalenceMember;
|
|
|
|
|
1999-08-16 04:17:58 +02:00
|
|
|
/*
 * PathKeys
 *
 * The sort ordering of a path is represented by a list of PathKey nodes.
 * An empty list implies no known ordering.  Otherwise the first item
 * represents the primary sort key, the second the first secondary sort key,
 * etc.  The value being sorted is represented by linking to an
 * EquivalenceClass containing that value and including pk_opfamily among its
 * ec_opfamilies.  The EquivalenceClass tells which collation to use, too.
 * This is a convenient method because it makes it trivial to detect
 * equivalent and closely-related orderings.  (See optimizer/README for more
 * information.)
 *
 * Note: pk_strategy is either BTLessStrategyNumber (for ASC) or
 * BTGreaterStrategyNumber (for DESC).  We assume that all ordering-capable
 * index types will use btree-compatible strategy numbers.
 */
typedef struct PathKey
{
	NodeTag		type;

	EquivalenceClass *pk_eclass;	/* the value that is ordered */
	Oid			pk_opfamily;	/* btree opfamily defining the ordering */
	int			pk_strategy;	/* sort direction (ASC or DESC) */
	bool		pk_nulls_first; /* do NULLs come before normal values? */
} PathKey;
|
1999-02-09 04:51:42 +01:00
|
|
|
|
Optimize order of GROUP BY keys
When evaluating a query with a multi-column GROUP BY clause using sort,
the cost may be heavily dependent on the order in which the keys are
compared when building the groups. Grouping does not imply any ordering,
so we're allowed to compare the keys in arbitrary order, and a Hash Agg
leverages this. But for Group Agg, we simply compared keys in the order
as specified in the query. This commit explores alternative ordering of
the keys, trying to find a cheaper one.
In principle, we might generate grouping paths for all permutations of
the keys, and leave the rest to the optimizer. But that might get very
expensive, so we try to pick only a couple interesting orderings based
on both local and global information.
When planning the grouping path, we explore statistics (number of
distinct values, cost of the comparison function) for the keys and
reorder them to minimize comparison costs. Intuitively, it may be better
to perform more expensive comparisons (for complex data types etc.)
last, because maybe the cheaper comparisons will be enough. Similarly,
the higher the cardinality of a key, the lower the probability we’ll
need to compare more keys. The patch generates and costs various
orderings, picking the cheapest ones.
The ordering of group keys may interact with other parts of the query,
some of which may not be known while planning the grouping. E.g. there
may be an explicit ORDER BY clause, or some other ordering-dependent
operation, higher up in the query, and using the same ordering may allow
using either incremental sort or even eliminate the sort entirely.
The patch generates orderings and picks those minimizing the comparison
cost (for various pathkeys), and then adds orderings that might be
useful for operations higher up in the plan (ORDER BY, etc.). Finally,
it always keeps the ordering specified in the query, on the assumption
the user might have additional insights.
This introduces a new GUC enable_group_by_reordering, so that the
optimization may be disabled if needed.
The original patch was proposed by Teodor Sigaev, and later improved and
reworked by Dmitry Dolgov. Reviews by a number of people, including me,
Andrey Lepikhov, Claudio Freire, Ibrar Ahmed and Zhihong Yu.
Author: Dmitry Dolgov, Teodor Sigaev, Tomas Vondra
Reviewed-by: Tomas Vondra, Andrey Lepikhov, Claudio Freire, Ibrar Ahmed, Zhihong Yu
Discussion: https://postgr.es/m/7c79e6a5-8597-74e8-0671-1c39d124c9d6%40sigaev.ru
Discussion: https://postgr.es/m/CA%2Bq6zcW_4o2NC0zutLkOJPsFt80megSpX_dVRo6GK9PC-Jx_Ag%40mail.gmail.com
2022-03-31 00:09:11 +02:00
|
|
|
/*
 * Combines information about pathkeys and the associated clauses.
 */
typedef struct PathKeyInfo
{
	NodeTag		type;
	List	   *pathkeys;		/* list of PathKey nodes, as above */
	List	   *clauses;		/* clauses associated with those pathkeys
								 * (presumably SortGroupClauses — confirm
								 * against the code that builds these) */
} PathKeyInfo;
|
|
|
|
|
2021-03-29 03:47:05 +02:00
|
|
|
/*
 * VolatileFunctionStatus -- allows nodes to cache their
 * contain_volatile_functions properties.  VOLATILITY_UNKNOWN means not yet
 * determined.
 */
typedef enum VolatileFunctionStatus
{
	VOLATILITY_UNKNOWN = 0,		/* not yet determined */
	VOLATILITY_VOLATILE,		/* contains volatile function(s) */
	VOLATILITY_NOVOLATILE		/* known to contain no volatile functions */
} VolatileFunctionStatus;
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
|
2016-03-14 21:59:59 +01:00
|
|
|
/*
 * PathTarget
 *
 * This struct contains what we need to know during planning about the
 * targetlist (output columns) that a Path will compute.  Each RelOptInfo
 * includes a default PathTarget, which its individual Paths may simply
 * reference.  However, in some cases a Path may compute outputs different
 * from other Paths, and in that case we make a custom PathTarget for it.
 * For example, an indexscan might return index expressions that would
 * otherwise need to be explicitly calculated.  (Note also that "upper"
 * relations generally don't have useful default PathTargets.)
 *
 * exprs contains bare expressions; they do not have TargetEntry nodes on top,
 * though those will appear in finished Plans.
 *
 * sortgrouprefs[] is an array of the same length as exprs, containing the
 * corresponding sort/group refnos, or zeroes for expressions not referenced
 * by sort/group clauses.  If sortgrouprefs is NULL (which it generally is in
 * RelOptInfo.reltarget targets; only upper-level Paths contain this info),
 * we have not identified sort/group columns in this tlist.  This allows us to
 * deal with sort/group refnos when needed with less expense than including
 * TargetEntry nodes in the exprs list.
 */
typedef struct PathTarget
{
	NodeTag		type;

	List	   *exprs;			/* list of expressions to be computed */
	Index	   *sortgrouprefs;	/* corresponding sort/group refnos, or 0;
								 * may be NULL (see comment above) */
	QualCost	cost;			/* cost of evaluating the expressions */
	int			width;			/* estimated avg width of result tuples */
	VolatileFunctionStatus has_volatile_expr;	/* indicates if exprs contain
												 * any volatile functions
												 * (cached; may be UNKNOWN) */
} PathTarget;
|
|
|
|
|
2016-06-13 18:59:25 +02:00
|
|
|
/*
 * Convenience macro to get a sort/group refno from a PathTarget.
 * Returns (Index) 0 when the target's sortgrouprefs array is NULL, i.e.
 * when sort/group columns have not been identified (see PathTarget).
 * Note: "colno" is evaluated only when sortgrouprefs is non-NULL.
 */
#define get_pathtarget_sortgroupref(target, colno) \
	((target)->sortgrouprefs ? (target)->sortgrouprefs[colno] : (Index) 0)
|
|
|
|
|
2016-03-14 21:59:59 +01:00
|
|
|
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
/*
 * ParamPathInfo
 *
 * All parameterized paths for a given relation with given required outer rels
 * link to a single ParamPathInfo, which stores common information such as
 * the estimated rowcount for this parameterization.  We do this partly to
 * avoid recalculations, but mostly to ensure that the estimated rowcount
 * is in fact the same for every such path.
 *
 * Note: ppi_clauses is only used in ParamPathInfos for base relation paths;
 * in join cases it's NIL because the set of relevant clauses varies depending
 * on how the join is formed.  The relevant clauses will appear in each
 * parameterized join path's joinrestrictinfo list, instead.
 */
typedef struct ParamPathInfo
{
	NodeTag		type;

	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
	Cardinality ppi_rows;		/* estimated number of result tuples */
	List	   *ppi_clauses;	/* join clauses available from outer rels
								 * (NIL for join relations; see above) */
} ParamPathInfo;
|
|
|
|
|
|
|
|
|
1999-08-16 04:17:58 +02:00
|
|
|
/*
 * Type "Path" is used as-is for sequential-scan paths, as well as some other
 * simple plan types that we don't need any extra information in the path for.
 * For other path types it is the first component of a larger struct.
 *
 * "pathtype" is the NodeTag of the Plan node we could build from this Path.
 * It is partially redundant with the Path's NodeTag, but allows us to use
 * the same Path type for multiple Plan types when there is no need to
 * distinguish the Plan type during path processing.
 *
 * "parent" identifies the relation this Path scans, and "pathtarget"
 * describes the precise set of output columns the Path would compute.
 * In simple cases all Paths for a given rel share the same targetlist,
 * which we represent by having path->pathtarget equal to parent->reltarget.
 *
 * "param_info", if not NULL, links to a ParamPathInfo that identifies outer
 * relation(s) that provide parameter values to each scan of this path.
 * That means this path can only be joined to those rels by means of nestloop
 * joins with this path on the inside.  Also note that a parameterized path
 * is responsible for testing all "movable" joinclauses involving this rel
 * and the specified outer rel(s).
 *
 * "rows" is the same as parent->rows in simple paths, but in parameterized
 * paths and UniquePaths it can be less than parent->rows, reflecting the
 * fact that we've filtered by extra join conditions or removed duplicates.
 *
 * "pathkeys" is a List of PathKey nodes (see above), describing the sort
 * ordering of the path's output rows.
 */
typedef struct Path
{
	NodeTag		type;

	NodeTag		pathtype;		/* tag identifying scan/join method */

	RelOptInfo *parent;			/* the relation this path can build */
	PathTarget *pathtarget;		/* list of Vars/Exprs, cost, width */

	ParamPathInfo *param_info;	/* parameterization info, or NULL if none */

	bool		parallel_aware; /* engage parallel-aware logic? */
	bool		parallel_safe;	/* OK to use as part of parallel plan? */
	int			parallel_workers;	/* desired # of workers; 0 = not parallel */

	/* estimated size/costs for path (see costsize.c for more info) */
	Cardinality rows;			/* estimated number of result tuples */
	Cost		startup_cost;	/* cost expended before fetching any tuples */
	Cost		total_cost;		/* total cost (assuming all tuples fetched) */

	List	   *pathkeys;		/* sort ordering of path's output */
	/* pathkeys is a List of PathKey nodes; see above */
} Path;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
/*
 * Extract a path's required outer (parameterization) relids, or NULL if the
 * path is unparameterized.  Beware: the argument is evaluated twice.
 */
#define PATH_REQ_OUTER(path) \
	((path)->param_info == NULL ? (Relids) NULL : (path)->param_info->ppi_req_outer)
|
|
|
|
|
1999-07-25 01:21:14 +02:00
|
|
|
/*----------
 * IndexPath represents an index scan over a single index.
 *
 * This struct is used for both regular indexscans and index-only scans;
 * path.pathtype is T_IndexScan or T_IndexOnlyScan to show which is meant.
 *
 * 'indexinfo' is the index to be scanned.
 *
 * 'indexclauses' is a list of IndexClause nodes, each representing one
 * index-checkable restriction, with implicit AND semantics across the list.
 * An empty list implies a full index scan.
 *
 * 'indexorderbys', if not NIL, is a list of ORDER BY expressions that have
 * been found to be usable as ordering operators for an amcanorderbyop index.
 * The list must match the path's pathkeys, ie, one expression per pathkey
 * in the same order.  These are not RestrictInfos, just bare expressions,
 * since they generally won't yield booleans.  It's guaranteed that each
 * expression has the index key on the left side of the operator.
 *
 * 'indexorderbycols' is an integer list of index column numbers (zero-based)
 * of the same length as 'indexorderbys', showing which index column each
 * ORDER BY expression is meant to be used with.  (There is no restriction
 * on which index column each ORDER BY can be used with.)
 *
 * 'indexscandir' is one of:
 *		ForwardScanDirection: forward scan of an ordered index
 *		BackwardScanDirection: backward scan of an ordered index
 *		NoMovementScanDirection: scan of an unordered index, or don't care
 * (The executor doesn't care whether it gets ForwardScanDirection or
 * NoMovementScanDirection for an indexscan, but the planner wants to
 * distinguish ordered from unordered indexes for building pathkeys.)
 *
 * 'indextotalcost' and 'indexselectivity' are saved in the IndexPath so that
 * we need not recompute them when considering using the same index in a
 * bitmap index/heap scan (see BitmapHeapPath).  The costs of the IndexPath
 * itself represent the costs of an IndexScan or IndexOnlyScan plan type.
 *----------
 */
typedef struct IndexPath
{
	Path		path;
	IndexOptInfo *indexinfo;	/* the index to be scanned */
	List	   *indexclauses;	/* list of IndexClause nodes (see above) */
	List	   *indexorderbys;	/* bare ORDER BY expressions, matched 1:1
								 * with path.pathkeys */
	List	   *indexorderbycols;	/* integer list of index column numbers
									 * paired with indexorderbys */
	ScanDirection indexscandir; /* forward, backward, or no-movement scan */
	Cost		indextotalcost; /* saved index access cost (see above) */
	Selectivity indexselectivity;	/* saved index clause selectivity (see
									 * above) */
} IndexPath;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Refactor the representation of indexable clauses in IndexPaths.
In place of three separate but interrelated lists (indexclauses,
indexquals, and indexqualcols), an IndexPath now has one list
"indexclauses" of IndexClause nodes. This holds basically the same
information as before, but in a more useful format: in particular, there
is now a clear connection between an indexclause (an original restriction
clause from WHERE or JOIN/ON) and the indexquals (directly usable index
conditions) derived from it.
We also change the ground rules a bit by mandating that clause commutation,
if needed, be done up-front so that what is stored in the indexquals list
is always directly usable as an index condition. This gets rid of repeated
re-determination of which side of the clause is the indexkey during costing
and plan generation, as well as repeated lookups of the commutator
operator. To minimize the added up-front cost, the typical case of
commuting a plain OpExpr is handled by a new special-purpose function
commute_restrictinfo(). For RowCompareExprs, generating the new clause
properly commuted to begin with is not really any more complex than before,
it's just different --- and we can save doing that work twice, as the
pretty-klugy original implementation did.
Tracking the connection between original and derived clauses lets us
also track explicitly whether the derived clauses are an exact or lossy
translation of the original. This provides a cheap solution to getting
rid of unnecessary rechecks of boolean index clauses, which previously
seemed like it'd be more expensive than it was worth.
Another pleasant (IMO) side-effect is that EXPLAIN now always shows
index clauses with the indexkey on the left; this seems less confusing.
This commit leaves expand_indexqual_conditions() and some related
functions in a slightly messy state. I didn't bother to change them
any more than minimally necessary to work with the new data structure,
because all that code is going to be refactored out of existence in
a follow-on patch.
Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
|
|
|
/*
|
|
|
|
* Each IndexClause references a RestrictInfo node from the query's WHERE
|
|
|
|
* or JOIN conditions, and shows how that restriction can be applied to
|
|
|
|
* the particular index. We support both indexclauses that are directly
|
|
|
|
* usable by the index machinery, which are typically of the form
|
|
|
|
* "indexcol OP pseudoconstant", and those from which an indexable qual
|
|
|
|
* can be derived. The simplest such transformation is that a clause
|
|
|
|
* of the form "pseudoconstant OP indexcol" can be commuted to produce an
|
|
|
|
* indexable qual (the index machinery expects the indexcol to be on the
|
|
|
|
* left always). Another example is that we might be able to extract an
|
|
|
|
* indexable range condition from a LIKE condition, as in "x LIKE 'foo%bar'"
|
|
|
|
* giving rise to "x >= 'foo' AND x < 'fop'". Derivation of such lossy
|
|
|
|
* conditions is done by a planner support function attached to the
|
|
|
|
* indexclause's top-level function or operator.
|
|
|
|
*
|
2019-02-15 01:37:30 +01:00
|
|
|
* indexquals is a list of RestrictInfos for the directly-usable index
|
|
|
|
* conditions associated with this IndexClause. In the simplest case
|
|
|
|
* it's a one-element list whose member is iclause->rinfo. Otherwise,
|
|
|
|
* it contains one or more directly-usable indexqual conditions extracted
|
|
|
|
* from the given clause. The 'lossy' flag indicates whether the
|
|
|
|
* indexquals are semantically equivalent to the original clause, or
|
|
|
|
* represent a weaker condition.
|
Refactor the representation of indexable clauses in IndexPaths.
In place of three separate but interrelated lists (indexclauses,
indexquals, and indexqualcols), an IndexPath now has one list
"indexclauses" of IndexClause nodes. This holds basically the same
information as before, but in a more useful format: in particular, there
is now a clear connection between an indexclause (an original restriction
clause from WHERE or JOIN/ON) and the indexquals (directly usable index
conditions) derived from it.
We also change the ground rules a bit by mandating that clause commutation,
if needed, be done up-front so that what is stored in the indexquals list
is always directly usable as an index condition. This gets rid of repeated
re-determination of which side of the clause is the indexkey during costing
and plan generation, as well as repeated lookups of the commutator
operator. To minimize the added up-front cost, the typical case of
commuting a plain OpExpr is handled by a new special-purpose function
commute_restrictinfo(). For RowCompareExprs, generating the new clause
properly commuted to begin with is not really any more complex than before,
it's just different --- and we can save doing that work twice, as the
pretty-klugy original implementation did.
Tracking the connection between original and derived clauses lets us
also track explicitly whether the derived clauses are an exact or lossy
translation of the original. This provides a cheap solution to getting
rid of unnecessary rechecks of boolean index clauses, which previously
seemed like it'd be more expensive than it was worth.
Another pleasant (IMO) side-effect is that EXPLAIN now always shows
index clauses with the indexkey on the left; this seems less confusing.
This commit leaves expand_indexqual_conditions() and some related
functions in a slightly messy state. I didn't bother to change them
any more than minimally necessary to work with the new data structure,
because all that code is going to be refactored out of existence in
a follow-on patch.
Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
|
|
|
*
|
|
|
|
* Normally, indexcol is the index of the single index column the clause
|
|
|
|
* works on, and indexcols is NIL. But if the clause is a RowCompareExpr,
|
|
|
|
* indexcol is the index of the leading column, and indexcols is a list of
|
|
|
|
* all the affected columns. (Note that indexcols matches up with the
|
2019-02-15 01:37:30 +01:00
|
|
|
* columns of the actual indexable RowCompareExpr in indexquals, which
|
|
|
|
* might be different from the original in rinfo.)
|
Refactor the representation of indexable clauses in IndexPaths.
In place of three separate but interrelated lists (indexclauses,
indexquals, and indexqualcols), an IndexPath now has one list
"indexclauses" of IndexClause nodes. This holds basically the same
information as before, but in a more useful format: in particular, there
is now a clear connection between an indexclause (an original restriction
clause from WHERE or JOIN/ON) and the indexquals (directly usable index
conditions) derived from it.
We also change the ground rules a bit by mandating that clause commutation,
if needed, be done up-front so that what is stored in the indexquals list
is always directly usable as an index condition. This gets rid of repeated
re-determination of which side of the clause is the indexkey during costing
and plan generation, as well as repeated lookups of the commutator
operator. To minimize the added up-front cost, the typical case of
commuting a plain OpExpr is handled by a new special-purpose function
commute_restrictinfo(). For RowCompareExprs, generating the new clause
properly commuted to begin with is not really any more complex than before,
it's just different --- and we can save doing that work twice, as the
pretty-klugy original implementation did.
Tracking the connection between original and derived clauses lets us
also track explicitly whether the derived clauses are an exact or lossy
translation of the original. This provides a cheap solution to getting
rid of unnecessary rechecks of boolean index clauses, which previously
seemed like it'd be more expensive than it was worth.
Another pleasant (IMO) side-effect is that EXPLAIN now always shows
index clauses with the indexkey on the left; this seems less confusing.
This commit leaves expand_indexqual_conditions() and some related
functions in a slightly messy state. I didn't bother to change them
any more than minimally necessary to work with the new data structure,
because all that code is going to be refactored out of existence in
a follow-on patch.
Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
|
|
|
*
|
|
|
|
* An IndexPath's IndexClause list is required to be ordered by index
|
|
|
|
* column, i.e. the indexcol values must form a nondecreasing sequence.
|
|
|
|
* (The order of multiple clauses for the same index column is unspecified.)
|
|
|
|
*/
|
|
|
|
typedef struct IndexClause
|
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
struct RestrictInfo *rinfo; /* original restriction or join clause */
|
2019-02-15 01:37:30 +01:00
|
|
|
List *indexquals; /* indexqual(s) derived from it */
|
Refactor the representation of indexable clauses in IndexPaths.
In place of three separate but interrelated lists (indexclauses,
indexquals, and indexqualcols), an IndexPath now has one list
"indexclauses" of IndexClause nodes. This holds basically the same
information as before, but in a more useful format: in particular, there
is now a clear connection between an indexclause (an original restriction
clause from WHERE or JOIN/ON) and the indexquals (directly usable index
conditions) derived from it.
We also change the ground rules a bit by mandating that clause commutation,
if needed, be done up-front so that what is stored in the indexquals list
is always directly usable as an index condition. This gets rid of repeated
re-determination of which side of the clause is the indexkey during costing
and plan generation, as well as repeated lookups of the commutator
operator. To minimize the added up-front cost, the typical case of
commuting a plain OpExpr is handled by a new special-purpose function
commute_restrictinfo(). For RowCompareExprs, generating the new clause
properly commuted to begin with is not really any more complex than before,
it's just different --- and we can save doing that work twice, as the
pretty-klugy original implementation did.
Tracking the connection between original and derived clauses lets us
also track explicitly whether the derived clauses are an exact or lossy
translation of the original. This provides a cheap solution to getting
rid of unnecessary rechecks of boolean index clauses, which previously
seemed like it'd be more expensive than it was worth.
Another pleasant (IMO) side-effect is that EXPLAIN now always shows
index clauses with the indexkey on the left; this seems less confusing.
This commit leaves expand_indexqual_conditions() and some related
functions in a slightly messy state. I didn't bother to change them
any more than minimally necessary to work with the new data structure,
because all that code is going to be refactored out of existence in
a follow-on patch.
Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
|
|
|
bool lossy; /* are indexquals a lossy version of clause? */
|
|
|
|
AttrNumber indexcol; /* index column the clause uses (zero-based) */
|
|
|
|
List *indexcols; /* multiple index columns, if RowCompare */
|
|
|
|
} IndexClause;
|
|
|
|
|
2005-04-20 00:35:18 +02:00
|
|
|
/*
|
|
|
|
* BitmapHeapPath represents one or more indexscans that generate TID bitmaps
|
|
|
|
* instead of directly accessing the heap, followed by AND/OR combinations
|
|
|
|
* to produce a single bitmap, followed by a heap scan that uses the bitmap.
|
|
|
|
* Note that the output is always considered unordered, since it will come
|
|
|
|
* out in physical heap order no matter what the underlying indexes did.
|
|
|
|
*
|
|
|
|
* The individual indexscans are represented by IndexPath nodes, and any
|
2005-04-25 03:30:14 +02:00
|
|
|
* logic on top of them is represented by a tree of BitmapAndPath and
|
|
|
|
* BitmapOrPath nodes. Notice that we can use the same IndexPath node both
|
2011-10-11 20:20:06 +02:00
|
|
|
* to represent a regular (or index-only) index scan plan, and as the child
|
|
|
|
* of a BitmapHeapPath that represents scanning the same index using a
|
|
|
|
* BitmapIndexScan. The startup_cost and total_cost figures of an IndexPath
|
|
|
|
* always represent the costs to use it as a regular (or index-only)
|
|
|
|
* IndexScan. The costs of a BitmapIndexScan can be computed using the
|
|
|
|
* IndexPath's indextotalcost and indexselectivity.
|
2005-04-20 00:35:18 +02:00
|
|
|
*/
|
|
|
|
typedef struct BitmapHeapPath
|
|
|
|
{
|
|
|
|
Path path;
|
2005-04-21 21:18:13 +02:00
|
|
|
Path *bitmapqual; /* IndexPath, BitmapAndPath, BitmapOrPath */
|
2005-04-20 00:35:18 +02:00
|
|
|
} BitmapHeapPath;
|
|
|
|
|
2005-04-21 21:18:13 +02:00
|
|
|
/*
|
|
|
|
* BitmapAndPath represents a BitmapAnd plan node; it can only appear as
|
|
|
|
* part of the substructure of a BitmapHeapPath. The Path structure is
|
|
|
|
* a bit more heavyweight than we really need for this, but for simplicity
|
|
|
|
* we make it a derivative of Path anyway.
|
|
|
|
*/
|
|
|
|
typedef struct BitmapAndPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
List *bitmapquals; /* IndexPaths and BitmapOrPaths */
|
|
|
|
Selectivity bitmapselectivity;
|
|
|
|
} BitmapAndPath;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BitmapOrPath represents a BitmapOr plan node; it can only appear as
|
|
|
|
* part of the substructure of a BitmapHeapPath. The Path structure is
|
|
|
|
* a bit more heavyweight than we really need for this, but for simplicity
|
|
|
|
* we make it a derivative of Path anyway.
|
|
|
|
*/
|
|
|
|
typedef struct BitmapOrPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
List *bitmapquals; /* IndexPaths and BitmapAndPaths */
|
|
|
|
Selectivity bitmapselectivity;
|
|
|
|
} BitmapOrPath;
|
|
|
|
|
2000-11-12 01:37:02 +01:00
|
|
|
/*
|
|
|
|
* TidPath represents a scan by TID
|
2004-01-05 06:07:36 +01:00
|
|
|
*
|
2005-11-26 23:14:57 +01:00
|
|
|
* tidquals is an implicitly OR'ed list of qual expressions of the form
|
2018-12-30 21:24:28 +01:00
|
|
|
* "CTID = pseudoconstant", or "CTID = ANY(pseudoconstant_array)",
|
|
|
|
* or a CurrentOfExpr for the relation.
|
2000-11-12 01:37:02 +01:00
|
|
|
*/
|
1999-11-23 21:07:06 +01:00
|
|
|
typedef struct TidPath
|
|
|
|
{
|
|
|
|
Path path;
|
2005-11-26 23:14:57 +01:00
|
|
|
List *tidquals; /* qual(s) involving CTID = something */
|
1999-11-23 21:07:06 +01:00
|
|
|
} TidPath;
|
|
|
|
|
2021-02-27 10:59:36 +01:00
|
|
|
/*
|
2021-07-08 12:45:09 +02:00
|
|
|
* TidRangePath represents a scan by a contiguous range of TIDs
|
2021-02-27 10:59:36 +01:00
|
|
|
*
|
|
|
|
* tidrangequals is an implicitly AND'ed list of qual expressions of the form
|
|
|
|
* "CTID relop pseudoconstant", where relop is one of >,>=,<,<=.
|
|
|
|
*/
|
|
|
|
typedef struct TidRangePath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
List *tidrangequals;
|
|
|
|
} TidRangePath;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
|
|
|
|
* SubqueryScanPath represents a scan of an unflattened subquery-in-FROM
|
|
|
|
*
|
|
|
|
* Note that the subpath comes from a different planning domain; for example
|
|
|
|
* RTE indexes within it mean something different from those known to the
|
|
|
|
* SubqueryScanPath. path.parent->subroot is the planning context needed to
|
|
|
|
* interpret the subpath.
|
|
|
|
*/
|
|
|
|
typedef struct SubqueryScanPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
Path *subpath; /* path representing subquery execution */
|
|
|
|
} SubqueryScanPath;
|
|
|
|
|
2011-02-20 06:17:18 +01:00
|
|
|
/*
|
2016-04-21 19:30:48 +02:00
|
|
|
* ForeignPath represents a potential scan of a foreign table, foreign join
|
|
|
|
* or foreign upper-relation.
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
|
|
|
*
|
|
|
|
* fdw_private stores FDW private data about the scan. While fdw_private is
|
|
|
|
* not actually touched by the core code during normal operations, it's
|
|
|
|
* generally a good idea to use a representation that can be dumped by
|
|
|
|
* nodeToString(), so that you can examine the structure during debugging
|
|
|
|
* with tools like pprint().
|
2011-02-20 06:17:18 +01:00
|
|
|
*/
|
|
|
|
typedef struct ForeignPath
|
|
|
|
{
|
|
|
|
Path path;
|
Allow foreign and custom joins to handle EvalPlanQual rechecks.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 provided basic
infrastructure for allowing a foreign data wrapper or custom scan
provider to replace a join of one or more tables with a scan.
However, this infrastructure failed to take into account the need
for possible EvalPlanQual rechecks, and ExecScanFetch would fail
an assertion (or just overwrite memory) if such a check was attempted
for a plan containing a pushed-down join. To fix, adjust the EPQ
machinery to skip some processing steps when scanrelid == 0, making
those the responsibility of scan's recheck method, which also has
the responsibility in this case of correctly populating the relevant
slot.
To allow foreign scans to gain control in the right place to make
use of this new facility, add a new, optional RecheckForeignScan
method. Also, allow a foreign scan to have a child plan, which can
be used to correctly populate the slot (or perhaps for something
else, but this is the only use currently envisioned).
KaiGai Kohei, reviewed by Robert Haas, Etsuro Fujita, and Kyotaro
Horiguchi.
2015-12-08 18:31:03 +01:00
|
|
|
Path *fdw_outerpath;
|
2012-03-05 22:15:59 +01:00
|
|
|
List *fdw_private;
|
2011-02-20 06:17:18 +01:00
|
|
|
} ForeignPath;
|
|
|
|
|
2014-11-07 23:26:02 +01:00
|
|
|
/*
|
2014-11-22 00:21:46 +01:00
|
|
|
* CustomPath represents a table scan done by some out-of-core extension.
|
|
|
|
*
|
|
|
|
* We provide a set of hooks here - which the provider must take care to set
|
|
|
|
* up correctly - to allow extensions to supply their own methods of scanning
|
|
|
|
* a relation. For example, a provider might provide GPU acceleration, a
|
|
|
|
* cache-based scan, or some other kind of logic we haven't dreamed up yet.
|
|
|
|
*
|
|
|
|
* CustomPaths can be injected into the planning process for a relation by
|
|
|
|
* set_rel_pathlist_hook functions.
|
|
|
|
*
|
|
|
|
* Core code must avoid assuming that the CustomPath is only as large as
|
|
|
|
* the structure declared here; providers are allowed to make it the first
|
|
|
|
* element in a larger structure. (Since the planner never copies Paths,
|
|
|
|
* this doesn't add any complication.) However, for consistency with the
|
|
|
|
* FDW case, we provide a "custom_private" field in CustomPath; providers
|
|
|
|
* may prefer to use that rather than define another struct type.
|
2014-11-07 23:26:02 +01:00
|
|
|
*/
|
|
|
|
|
2016-03-29 17:00:18 +02:00
|
|
|
struct CustomPathMethods;
|
2014-11-07 23:26:02 +01:00
|
|
|
|
2014-11-21 00:36:07 +01:00
|
|
|
typedef struct CustomPath
|
|
|
|
{
|
|
|
|
Path path;
|
2016-08-31 09:06:18 +02:00
|
|
|
uint32 flags; /* mask of CUSTOMPATH_* flags, see
|
|
|
|
* nodes/extensible.h */
|
2015-06-26 15:40:47 +02:00
|
|
|
List *custom_paths; /* list of child Path nodes, if any */
|
2014-11-22 00:21:46 +01:00
|
|
|
List *custom_private;
|
2016-03-29 17:00:18 +02:00
|
|
|
const struct CustomPathMethods *methods;
|
2014-11-21 00:36:07 +01:00
|
|
|
} CustomPath;
|
|
|
|
|
2000-11-12 01:37:02 +01:00
|
|
|
/*
|
|
|
|
* AppendPath represents an Append plan, ie, successive execution of
|
2006-01-31 22:39:25 +01:00
|
|
|
* several member plans.
|
2005-07-23 23:05:48 +02:00
|
|
|
*
|
Support Parallel Append plan nodes.
When we create an Append node, we can spread out the workers over the
subplans instead of piling on to each subplan one at a time, which
should typically be a bit more efficient, both because the startup
cost of any plan executed entirely by one worker is paid only once and
also because of reduced contention. We can also construct Append
plans using a mix of partial and non-partial subplans, which may allow
for parallelism in places that otherwise couldn't support it.
Unfortunately, this patch doesn't handle the important case of
parallelizing UNION ALL by running each branch in a separate worker;
the executor infrastructure is added here, but more planner work is
needed.
Amit Khandekar, Robert Haas, Amul Sul, reviewed and tested by
Ashutosh Bapat, Amit Langote, Rafia Sabih, Amit Kapila, and
Rajkumar Raghuwanshi.
Discussion: http://postgr.es/m/CAJ3gD9dy0K_E8r727heqXoBmWZ83HwLFwdcaSSmBQ1+S+vRuUQ@mail.gmail.com
2017-12-05 23:28:39 +01:00
|
|
|
* For partial Append, 'subpaths' contains non-partial subpaths followed by
|
|
|
|
* partial subpaths.
|
|
|
|
*
|
2005-07-23 23:05:48 +02:00
|
|
|
* Note: it is possible for "subpaths" to contain only one, or even no,
|
|
|
|
* elements. These cases are optimized during create_append_plan.
|
2008-03-24 22:53:04 +01:00
|
|
|
* In particular, an AppendPath with no subpaths is a "dummy" path that
|
|
|
|
* is created to represent the case that a relation is provably empty.
|
Fix handling of targetlist SRFs when scan/join relation is known empty.
When we introduced separate ProjectSetPath nodes for application of
set-returning functions in v10, we inadvertently broke some cases where
we're supposed to recognize that the result of a subquery is known to be
empty (contain zero rows). That's because IS_DUMMY_REL was just looking
for a childless AppendPath without allowing for a ProjectSetPath being
possibly stuck on top. In itself, this didn't do anything much worse
than produce slightly worse plans for some corner cases.
Then in v11, commit 11cf92f6e rearranged things to allow the scan/join
targetlist to be applied directly to partial paths before they get
gathered. But it inserted a short-circuit path for dummy relations
that was a little too short: it failed to insert a ProjectSetPath node
at all for a targetlist containing set-returning functions, resulting in
bogus "set-valued function called in context that cannot accept a set"
errors, as reported in bug #15669 from Madelaine Thibaut.
The best way to fix this mess seems to be to reimplement IS_DUMMY_REL
so that it drills down through any ProjectSetPath nodes that might be
there (and it seems like we'd better allow for ProjectionPath as well).
While we're at it, make it look at rel->pathlist not cheapest_total_path,
so that it gives the right answer independently of whether set_cheapest
has been done lately. That dependency looks pretty shaky in the context
of code like apply_scanjoin_target_to_paths, and even if it's not broken
today it'd certainly bite us at some point. (Nastily, unsafe use of the
old coding would almost always work; the hazard comes down to possibly
looking through a dangling pointer, and only once in a blue moon would
you find something there that resulted in the wrong answer.)
It now looks like it was a mistake for IS_DUMMY_REL to be a macro: if
there are any extensions using it, they'll continue to use the old
inadequate logic until they're recompiled, after which they'll fail
to load into server versions predating this fix. Hopefully there are
few such extensions.
Having fixed IS_DUMMY_REL, the special path for dummy rels in
apply_scanjoin_target_to_paths is unnecessary as well as being wrong,
so we can just drop it.
Also change a few places that were testing for partitioned-ness of a
planner relation but not using IS_PARTITIONED_REL for the purpose; that
seems unsafe as well as inconsistent, plus it required an ugly hack in
apply_scanjoin_target_to_paths.
In passing, save a few cycles in apply_scanjoin_target_to_paths by
skipping processing of pre-existing paths for partitioned rels,
and do some cosmetic cleanup and comment adjustment in that function.
I renamed IS_DUMMY_PATH to IS_DUMMY_APPEND with the intention of breaking
any code that might be using it, since in almost every case that would
be wrong; IS_DUMMY_REL is what to be using instead.
In HEAD, also make set_dummy_rel_pathlist static (since it's no longer
used from outside allpaths.c), and delete is_dummy_plan, since it's no
longer used anywhere.
Back-patch as appropriate into v11 and v10.
Tom Lane and Julien Rouhaud
Discussion: https://postgr.es/m/15669-02fb3296cca26203@postgresql.org
2019-03-07 20:21:52 +01:00
|
|
|
* (This is a convenient representation because it means that when we build
|
|
|
|
* an appendrel and find that all its children have been excluded, no extra
|
|
|
|
* action is needed to recognize the relation as dummy.)
|
2000-11-12 01:37:02 +01:00
|
|
|
*/
|
|
|
|
typedef struct AppendPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
List *subpaths; /* list of component Paths */
|
Use Append rather than MergeAppend for scanning ordered partitions.
If we need ordered output from a scan of a partitioned table, but
the ordering matches the partition ordering, then we don't need to
use a MergeAppend to combine the pre-ordered per-partition scan
results: a plain Append will produce the same results. This
both saves useless comparison work inside the MergeAppend proper,
and allows us to start returning tuples after istarting up just
the first child node not all of them.
However, all is not peaches and cream, because if some of the
child nodes have high startup costs then there will be big
discontinuities in the tuples-returned-versus-elapsed-time curve.
The planner's cost model cannot handle that (yet, anyway).
If we model the Append's startup cost as being just the first
child's startup cost, we may drastically underestimate the cost
of fetching slightly more tuples than are available from the first
child. Since we've had bad experiences with over-optimistic choices
of "fast start" plans for ORDER BY LIMIT queries, that seems scary.
As a klugy workaround, set the startup cost estimate for an ordered
Append to be the sum of its children's startup costs (as MergeAppend
would). This doesn't really describe reality, but it's less likely
to cause a bad plan choice than an underestimated startup cost would.
In practice, the cases where we really care about this optimization
will have child plans that are IndexScans with zero startup cost,
so that the overly conservative estimate is still just zero.
David Rowley, reviewed by Julien Rouhaud and Antonin Houska
Discussion: https://postgr.es/m/CAKJS1f-hAqhPLRk_RaSFTgYxd=Tz5hA7kQ2h4-DhJufQk8TGuw@mail.gmail.com
2019-04-06 01:20:30 +02:00
|
|
|
/* Index of first partial path in subpaths; list_length(subpaths) if none */
|
Support Parallel Append plan nodes.
When we create an Append node, we can spread out the workers over the
subplans instead of piling on to each subplan one at a time, which
should typically be a bit more efficient, both because the startup
cost of any plan executed entirely by one worker is paid only once and
also because of reduced contention. We can also construct Append
plans using a mix of partial and non-partial subplans, which may allow
for parallelism in places that otherwise couldn't support it.
Unfortunately, this patch doesn't handle the important case of
parallelizing UNION ALL by running each branch in a separate worker;
the executor infrastructure is added here, but more planner work is
needed.
Amit Khandekar, Robert Haas, Amul Sul, reviewed and tested by
Ashutosh Bapat, Amit Langote, Rafia Sabih, Amit Kapila, and
Rajkumar Raghuwanshi.
Discussion: http://postgr.es/m/CAJ3gD9dy0K_E8r727heqXoBmWZ83HwLFwdcaSSmBQ1+S+vRuUQ@mail.gmail.com
2017-12-05 23:28:39 +01:00
|
|
|
int first_partial_path;
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality limit_tuples; /* hard limit on output tuples, or -1 */
|
2000-11-12 01:37:02 +01:00
|
|
|
} AppendPath;
|
|
|
|
|
Fix handling of targetlist SRFs when scan/join relation is known empty.
When we introduced separate ProjectSetPath nodes for application of
set-returning functions in v10, we inadvertently broke some cases where
we're supposed to recognize that the result of a subquery is known to be
empty (contain zero rows). That's because IS_DUMMY_REL was just looking
for a childless AppendPath without allowing for a ProjectSetPath being
possibly stuck on top. In itself, this didn't do anything much worse
than produce slightly worse plans for some corner cases.
Then in v11, commit 11cf92f6e rearranged things to allow the scan/join
targetlist to be applied directly to partial paths before they get
gathered. But it inserted a short-circuit path for dummy relations
that was a little too short: it failed to insert a ProjectSetPath node
at all for a targetlist containing set-returning functions, resulting in
bogus "set-valued function called in context that cannot accept a set"
errors, as reported in bug #15669 from Madelaine Thibaut.
The best way to fix this mess seems to be to reimplement IS_DUMMY_REL
so that it drills down through any ProjectSetPath nodes that might be
there (and it seems like we'd better allow for ProjectionPath as well).
While we're at it, make it look at rel->pathlist not cheapest_total_path,
so that it gives the right answer independently of whether set_cheapest
has been done lately. That dependency looks pretty shaky in the context
of code like apply_scanjoin_target_to_paths, and even if it's not broken
today it'd certainly bite us at some point. (Nastily, unsafe use of the
old coding would almost always work; the hazard comes down to possibly
looking through a dangling pointer, and only once in a blue moon would
you find something there that resulted in the wrong answer.)
It now looks like it was a mistake for IS_DUMMY_REL to be a macro: if
there are any extensions using it, they'll continue to use the old
inadequate logic until they're recompiled, after which they'll fail
to load into server versions predating this fix. Hopefully there are
few such extensions.
Having fixed IS_DUMMY_REL, the special path for dummy rels in
apply_scanjoin_target_to_paths is unnecessary as well as being wrong,
so we can just drop it.
Also change a few places that were testing for partitioned-ness of a
planner relation but not using IS_PARTITIONED_REL for the purpose; that
seems unsafe as well as inconsistent, plus it required an ugly hack in
apply_scanjoin_target_to_paths.
In passing, save a few cycles in apply_scanjoin_target_to_paths by
skipping processing of pre-existing paths for partitioned rels,
and do some cosmetic cleanup and comment adjustment in that function.
I renamed IS_DUMMY_PATH to IS_DUMMY_APPEND with the intention of breaking
any code that might be using it, since in almost every case that would
be wrong; IS_DUMMY_REL is what to be using instead.
In HEAD, also make set_dummy_rel_pathlist static (since it's no longer
used from outside allpaths.c), and delete is_dummy_plan, since it's no
longer used anywhere.
Back-patch as appropriate into v11 and v10.
Tom Lane and Julien Rouhaud
Discussion: https://postgr.es/m/15669-02fb3296cca26203@postgresql.org
2019-03-07 20:21:52 +01:00
|
|
|
/*
 * Test whether an AppendPath is a childless ("dummy") Append, i.e. one that
 * is known to produce zero rows.  Note this inspects only the AppendPath
 * itself; to test whether a whole relation is proven empty, use
 * IS_DUMMY_REL instead.
 */
#define IS_DUMMY_APPEND(p) \
	(IsA((p), AppendPath) && ((AppendPath *) (p))->subpaths == NIL)
|
|
|
|
|
Fix handling of targetlist SRFs when scan/join relation is known empty.
When we introduced separate ProjectSetPath nodes for application of
set-returning functions in v10, we inadvertently broke some cases where
we're supposed to recognize that the result of a subquery is known to be
empty (contain zero rows). That's because IS_DUMMY_REL was just looking
for a childless AppendPath without allowing for a ProjectSetPath being
possibly stuck on top. In itself, this didn't do anything much worse
than produce slightly worse plans for some corner cases.
Then in v11, commit 11cf92f6e rearranged things to allow the scan/join
targetlist to be applied directly to partial paths before they get
gathered. But it inserted a short-circuit path for dummy relations
that was a little too short: it failed to insert a ProjectSetPath node
at all for a targetlist containing set-returning functions, resulting in
bogus "set-valued function called in context that cannot accept a set"
errors, as reported in bug #15669 from Madelaine Thibaut.
The best way to fix this mess seems to be to reimplement IS_DUMMY_REL
so that it drills down through any ProjectSetPath nodes that might be
there (and it seems like we'd better allow for ProjectionPath as well).
While we're at it, make it look at rel->pathlist not cheapest_total_path,
so that it gives the right answer independently of whether set_cheapest
has been done lately. That dependency looks pretty shaky in the context
of code like apply_scanjoin_target_to_paths, and even if it's not broken
today it'd certainly bite us at some point. (Nastily, unsafe use of the
old coding would almost always work; the hazard comes down to possibly
looking through a dangling pointer, and only once in a blue moon would
you find something there that resulted in the wrong answer.)
It now looks like it was a mistake for IS_DUMMY_REL to be a macro: if
there are any extensions using it, they'll continue to use the old
inadequate logic until they're recompiled, after which they'll fail
to load into server versions predating this fix. Hopefully there are
few such extensions.
Having fixed IS_DUMMY_REL, the special path for dummy rels in
apply_scanjoin_target_to_paths is unnecessary as well as being wrong,
so we can just drop it.
Also change a few places that were testing for partitioned-ness of a
planner relation but not using IS_PARTITIONED_REL for the purpose; that
seems unsafe as well as inconsistent, plus it required an ugly hack in
apply_scanjoin_target_to_paths.
In passing, save a few cycles in apply_scanjoin_target_to_paths by
skipping processing of pre-existing paths for partitioned rels,
and do some cosmetic cleanup and comment adjustment in that function.
I renamed IS_DUMMY_PATH to IS_DUMMY_APPEND with the intention of breaking
any code that might be using it, since in almost every case that would
be wrong; IS_DUMMY_REL is what to be using instead.
In HEAD, also make set_dummy_rel_pathlist static (since it's no longer
used from outside allpaths.c), and delete is_dummy_plan, since it's no
longer used anywhere.
Back-patch as appropriate into v11 and v10.
Tom Lane and Julien Rouhaud
Discussion: https://postgr.es/m/15669-02fb3296cca26203@postgresql.org
2019-03-07 20:21:52 +01:00
|
|
|
/*
 * A relation that's been proven empty will have one path that is dummy
 * (but might have projection paths on top).  For historical reasons,
 * this is provided as a macro that wraps is_dummy_rel().
 */
#define IS_DUMMY_REL(r) is_dummy_rel(r)
/* Implementation lives outside this header (allpaths.c); it drills down
 * through any projection paths to find a dummy AppendPath. */
extern bool is_dummy_rel(RelOptInfo *rel);
|
2012-01-28 01:26:38 +01:00
|
|
|
|
2010-10-14 22:56:39 +02:00
|
|
|
/*
 * MergeAppendPath represents a MergeAppend plan, ie, the merging of sorted
 * results from several member plans to produce similarly-sorted output.
 */
typedef struct MergeAppendPath
{
	Path		path;
	List	   *subpaths;		/* list of component Paths */
	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
} MergeAppendPath;
|
|
|
|
|
2002-11-06 01:00:45 +01:00
|
|
|
/*
 * GroupResultPath represents use of a Result plan node to compute the
 * output of a degenerate GROUP BY case, wherein we know we should produce
 * exactly one row, which might then be filtered by a HAVING qual.
 *
 * Note that quals is a list of bare clauses, not RestrictInfos.
 */
typedef struct GroupResultPath
{
	Path		path;
	List	   *quals;			/* HAVING quals, as bare clauses */
} GroupResultPath;
|
2002-11-06 01:00:45 +01:00
|
|
|
|
2002-11-30 06:21:03 +01:00
|
|
|
/*
 * MaterialPath represents use of a Material plan node, i.e., caching of
 * the output of its subpath.  This is used when the subpath is expensive
 * and needs to be scanned repeatedly, or when we need mark/restore ability
 * and the subpath doesn't have it.
 */
typedef struct MaterialPath
{
	Path		path;
	Path	   *subpath;		/* path whose output is to be materialized */
} MaterialPath;
|
|
|
|
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct value for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
/*
|
2021-07-14 02:43:58 +02:00
|
|
|
* MemoizePath represents a Memoize plan node, i.e., a cache that caches
|
|
|
|
* tuples from parameterized paths to save the underlying node from having to
|
|
|
|
* be rescanned for parameter values which are already cached.
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct value for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
*/
|
2021-07-14 02:43:58 +02:00
|
|
|
typedef struct MemoizePath
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
{
	Path		path;
	Path	   *subpath;		/* outerpath to cache tuples from */
	List	   *hash_operators; /* hash operators for each key */
	List	   *param_exprs;	/* cache keys */
	bool		singlerow;		/* true if the cache entry is to be marked as
								 * complete after caching the first record. */
	bool		binary_mode;	/* true when cache key should be compared bit
								 * by bit, false when using hash equality ops */
	Cardinality calls;			/* expected number of rescans */
	uint32		est_entries;	/* The maximum number of entries that the
								 * planner expects will fit in the cache, or 0
								 * if unknown */
} MemoizePath;
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
|
2003-01-20 19:55:07 +01:00
|
|
|
/*
 * UniquePath represents elimination of distinct rows from the output of
 * its subpath.
 *
 * This can represent significantly different plans: either hash-based or
 * sort-based implementation, or a no-op if the input path can be proven
 * distinct already.  The decision is sufficiently localized that it's not
 * worth having separate Path node types.  (Note: in the no-op case, we could
 * eliminate the UniquePath node entirely and just return the subpath; but
 * it's convenient to have a UniquePath in the path tree to signal upper-level
 * routines that the input is known distinct.)
 */
typedef enum UniquePathMethod
{
	UNIQUE_PATH_NOOP,			/* input is known unique already */
	UNIQUE_PATH_HASH,			/* use hashing */
	UNIQUE_PATH_SORT			/* use sorting */
} UniquePathMethod;
|
|
|
|
|
2003-01-20 19:55:07 +01:00
|
|
|
typedef struct UniquePath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
Path *subpath;
|
2004-01-05 19:04:39 +01:00
|
|
|
UniquePathMethod umethod;
|
2008-08-14 20:48:00 +02:00
|
|
|
List *in_operators; /* equality operators of the IN clause */
|
|
|
|
List *uniq_exprs; /* expressions to be made unique */
|
2003-01-20 19:55:07 +01:00
|
|
|
} UniquePath;
|
|
|
|
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some designs suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
/*
|
|
|
|
* GatherPath runs several copies of a plan in parallel and collects the
|
|
|
|
* results. The parallel leader may also execute the plan, unless the
|
|
|
|
* single_copy flag is set.
|
|
|
|
*/
|
|
|
|
typedef struct GatherPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
Path *subpath; /* path for each worker */
|
2016-09-14 21:43:26 +02:00
|
|
|
bool single_copy; /* don't execute path more than once */
|
2017-04-01 03:01:20 +02:00
|
|
|
int num_workers; /* number of workers sought to help */
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some designs suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
} GatherPath;
|
|
|
|
|
2017-03-09 13:40:36 +01:00
|
|
|
/*
|
Force rescanning of parallel-aware scan nodes below a Gather[Merge].
The ExecReScan machinery contains various optimizations for postponing
or skipping rescans of plan subtrees; for example a HashAgg node may
conclude that it can re-use the table it built before, instead of
re-reading its input subtree. But that is wrong if the input contains
a parallel-aware table scan node, since the portion of the table scanned
by the leader process is likely to vary from one rescan to the next.
This explains the timing-dependent buildfarm failures we saw after
commit a2b70c89c.
The established mechanism for showing that a plan node's output is
potentially variable is to mark it as depending on some runtime Param.
Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC
parameter number, but carries no actual value) associated with each Gather
or GatherMerge node, mark parallel-aware nodes below that node as dependent
on that Param, and arrange for ExecReScanGather[Merge] to flag that Param
as changed whenever the Gather[Merge] node is rescanned.
This solution breaks an undocumented assumption made by the parallel
executor logic, namely that all rescans of nodes below a Gather[Merge]
will happen synchronously during the ReScan of the top node itself.
But that's fundamentally contrary to the design of the ExecReScan code,
and so was doomed to fail someday anyway (even if you want to argue
that the bug being fixed here wasn't a failure of that assumption).
A follow-on patch will address that issue. In the meantime, the worst
that's expected to happen is that given very bad timing luck, the leader
might have to do all the work during a rescan, because workers think
they have nothing to do, if they are able to start up before the eventual
ReScan of the leader's parallel-aware table scan node has reset the
shared scan state.
Although this problem exists in 9.6, there does not seem to be any way
for it to manifest there. Without GatherMerge, it seems that a plan tree
that has a rescan-short-circuiting node below Gather will always also
have one above it that will short-circuit in the same cases, preventing
the Gather from being rescanned. Hence we won't take the risk of
back-patching this change into 9.6. But v10 needs it.
Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
|
|
|
* GatherMergePath runs several copies of a plan in parallel and collects
|
2019-06-24 23:17:04 +02:00
|
|
|
* the results, preserving their common sort order.
|
2017-03-09 13:40:36 +01:00
|
|
|
*/
|
|
|
|
typedef struct GatherMergePath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
Path *subpath; /* path for each worker */
|
|
|
|
int num_workers; /* number of workers sought to help */
|
|
|
|
} GatherMergePath;
|
|
|
|
|
|
|
|
|
1999-08-16 04:17:58 +02:00
|
|
|
/*
|
|
|
|
* All join-type paths share these fields.
|
|
|
|
*/
|
|
|
|
|
|
|
|
typedef struct JoinPath
|
1996-08-28 03:59:28 +02:00
|
|
|
{
|
|
|
|
Path path;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-09-12 23:07:18 +02:00
|
|
|
JoinType jointype;
|
|
|
|
|
2017-04-08 04:20:03 +02:00
|
|
|
bool inner_unique; /* each outer tuple provably matches no more
|
|
|
|
* than one inner tuple */
|
|
|
|
|
1999-08-16 04:17:58 +02:00
|
|
|
Path *outerjoinpath; /* path for the outer side of the join */
|
|
|
|
Path *innerjoinpath; /* path for the inner side of the join */
|
2000-09-12 23:07:18 +02:00
|
|
|
|
2000-02-07 05:41:04 +01:00
|
|
|
List *joinrestrictinfo; /* RestrictInfos to apply to join */
|
2000-04-12 19:17:23 +02:00
|
|
|
|
2000-02-07 05:41:04 +01:00
|
|
|
/*
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
* See the notes for RelOptInfo and ParamPathInfo to understand why
|
|
|
|
* joinrestrictinfo is needed in JoinPath, and can't be merged into the
|
|
|
|
* parent RelOptInfo.
|
2000-02-07 05:41:04 +01:00
|
|
|
*/
|
1999-08-16 04:17:58 +02:00
|
|
|
} JoinPath;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A nested-loop path needs no special fields.
|
|
|
|
*/
|
|
|
|
|
2021-08-08 16:55:51 +02:00
|
|
|
typedef struct NestPath
|
|
|
|
{
|
|
|
|
JoinPath jpath;
|
|
|
|
} NestPath;
|
1999-08-16 04:17:58 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A mergejoin path has these fields.
|
|
|
|
*
|
2009-11-15 03:45:35 +01:00
|
|
|
* Unlike other path types, a MergePath node doesn't represent just a single
|
|
|
|
* run-time plan node: it can represent up to four. Aside from the MergeJoin
|
|
|
|
* node itself, there can be a Sort node for the outer input, a Sort node
|
|
|
|
* for the inner input, and/or a Material node for the inner input. We could
|
|
|
|
* represent these nodes by separate path nodes, but considering how many
|
|
|
|
* different merge paths are investigated during a complex join problem,
|
|
|
|
* it seems better to avoid unnecessary palloc overhead.
|
|
|
|
*
|
2000-02-19 00:47:31 +01:00
|
|
|
* path_mergeclauses lists the clauses (in the form of RestrictInfos)
|
2007-01-20 21:45:41 +01:00
|
|
|
* that will be used in the merge.
|
2000-02-19 00:47:31 +01:00
|
|
|
*
|
1999-08-16 04:17:58 +02:00
|
|
|
* Note that the mergeclauses are a subset of the parent relation's
|
|
|
|
* restriction-clause list. Any join clauses that are not mergejoinable
|
|
|
|
* appear only in the parent's restrict list, and must be checked by a
|
|
|
|
* qpqual at execution time.
|
2000-02-19 00:47:31 +01:00
|
|
|
*
|
|
|
|
* outersortkeys (resp. innersortkeys) is NIL if the outer path
|
|
|
|
* (resp. inner path) is already ordered appropriately for the
|
|
|
|
* mergejoin. If it is not NIL then it is a PathKeys list describing
|
2009-11-15 03:45:35 +01:00
|
|
|
* the ordering that must be created by an explicit Sort node.
|
|
|
|
*
|
2017-08-16 06:22:32 +02:00
|
|
|
* skip_mark_restore is true if the executor need not do mark/restore calls.
|
2017-04-08 04:20:03 +02:00
|
|
|
* Mark/restore overhead is usually required, but can be skipped if we know
|
|
|
|
* that the executor need find only one match per outer tuple, and that the
|
|
|
|
* mergeclauses are sufficient to identify a match. In such cases the
|
|
|
|
* executor can immediately advance the outer relation after processing a
|
2018-04-01 21:01:28 +02:00
|
|
|
* match, and therefore it need never back up the inner relation.
|
2017-04-08 04:20:03 +02:00
|
|
|
*
|
2017-08-16 06:22:32 +02:00
|
|
|
* materialize_inner is true if a Material node should be placed atop the
|
2009-11-15 03:45:35 +01:00
|
|
|
* inner input. This may appear with or without an inner Sort step.
|
1999-08-16 04:17:58 +02:00
|
|
|
*/
|
1999-02-12 18:25:05 +01:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
typedef struct MergePath
{
	JoinPath	jpath;			/* shared fields for all join paths */
	List	   *path_mergeclauses;	/* join clauses to be used for merge */
	List	   *outersortkeys;	/* keys for explicit sort, if any */
	List	   *innersortkeys;	/* keys for explicit sort, if any */
	bool		skip_mark_restore;	/* can executor skip mark/restore? */
	bool		materialize_inner;	/* add Materialize to inner? */
} MergePath;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-02-22 20:55:44 +01:00
|
|
|
/*
 * A hashjoin path has these fields.
 *
 * The remarks above for mergeclauses apply for hashclauses as well.
 *
 * Hashjoin does not care what order its inputs appear in, so we have
 * no need for sortkeys.
 */
typedef struct HashPath
{
	JoinPath	jpath;			/* shared fields for all join paths */
	List	   *path_hashclauses;	/* join clauses used for hashing */
	int			num_batches;	/* number of batches expected */
	Cardinality inner_rows_total;	/* total inner rows expected */
} HashPath;
|
1996-08-28 03:59:28 +02:00
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
 * ProjectionPath represents a projection (that is, targetlist computation)
 *
 * Nominally, this path node represents using a Result plan node to do a
 * projection step.  However, if the input plan node supports projection,
 * we can just modify its output targetlist to do the required calculations
 * directly, and not need a Result.  In some places in the planner we can just
 * jam the desired PathTarget into the input path node (and adjust its cost
 * accordingly), so we don't need a ProjectionPath.  But in other places
 * it's necessary to not modify the input path node, so we need a separate
 * ProjectionPath node, which is marked dummy to indicate that we intend to
 * assign the work to the input plan node.  The estimated cost for the
 * ProjectionPath node will account for whether a Result will be used or not.
 */
typedef struct ProjectionPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	bool		dummypp;		/* true if no separate Result is needed */
} ProjectionPath;
|
|
|
|
|
Move targetlist SRF handling from expression evaluation to new executor node.
Evaluation of set returning functions (SRFs) in the targetlist (like SELECT
generate_series(1,5)) so far was done in the expression evaluation (i.e.
ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code.
This meant that most executor nodes performing projection, and most
expression evaluation functions, had to deal with the possibility that an
evaluated expression could return a set of return values.
That's bad because it leads to repeated code in a lot of places. It also,
and that's my (Andres's) motivation, made it a lot harder to implement a
more efficient way of doing expression evaluation.
To fix this, introduce a new executor node (ProjectSet) that can evaluate
targetlists containing one or more SRFs. To avoid the complexity of the old
way of handling nested expressions returning sets (e.g. having to pass up
ExprDoneCond, and dealing with arguments to functions returning sets etc.),
those SRFs can only be at the top level of the node's targetlist. The
planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is
only necessary in ProjectSet nodes and that SRFs are only present at the
top level of the node's targetlist. If there are nested SRFs the planner
creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get
input from an underlying node.
We also discussed and prototyped evaluating targetlist SRFs using ROWS
FROM(), but that turned out to be more complicated than we'd hoped.
While moving SRF evaluation to ProjectSet would allow to retain the old
"least common multiple" behavior when multiple SRFs are present in one
targetlist (i.e. continue returning rows until all SRFs are at the end of
their input at the same time), we decided to instead only return rows till
all SRFs are exhausted, returning NULL for already exhausted ones. We
deemed the previous behavior to be too confusing, unexpected and actually
not particularly useful.
As a side effect, the previously prohibited case of multiple set returning
arguments to a function, is now allowed. Not because it's particularly
desirable, but because it ends up working and there seems to be no argument
for adding code to prohibit it.
Currently the behavior for COALESCE and CASE containing SRFs has changed,
returning multiple rows from the expression, even when the SRF containing
"arm" of the expression is not evaluated. That's because the SRFs are
evaluated in a separate ProjectSet node. As that's quite confusing, we're
likely to instead prohibit SRFs in those places. But that's still being
discussed, and the code would reside in places not touched here, so that's
a task for later.
There's a lot of, now superfluous, code dealing with set return expressions
around. But as the changes to get rid of those are verbose largely boring,
it seems better for readability to keep the cleanup as a separate commit.
Author: Tom Lane and Andres Freund
Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
|
|
|
/*
 * ProjectSetPath represents evaluation of a targetlist that includes
 * set-returning function(s), which will need to be implemented by a
 * ProjectSet plan node.
 */
typedef struct ProjectSetPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
} ProjectSetPath;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
 * SortPath represents an explicit sort step
 *
 * The sort keys are, by definition, the same as path.pathkeys.
 *
 * Note: the Sort plan node cannot project, so path.pathtarget must be the
 * same as the input's pathtarget.
 */
typedef struct SortPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
} SortPath;
|
|
|
|
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produce results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
/*
 * IncrementalSortPath represents an incremental sort step
 *
 * This is like a regular sort, except some leading key columns are assumed
 * to be ordered already.
 */
typedef struct IncrementalSortPath
{
	SortPath	spath;			/* shared fields for all sort paths */
	int			nPresortedCols; /* number of presorted columns */
} IncrementalSortPath;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
 * GroupPath represents grouping (of presorted input)
 *
 * groupClause represents the columns to be grouped on; the input path
 * must be at least that well sorted.
 *
 * We can also apply a qual to the grouped rows (equivalent of HAVING)
 */
typedef struct GroupPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	List	   *groupClause;	/* a list of SortGroupClause's */
	List	   *qual;			/* quals (HAVING quals), if any */
} GroupPath;
|
|
|
|
|
|
|
|
/*
 * UpperUniquePath represents adjacent-duplicate removal (in presorted input)
 *
 * The columns to be compared are the first numkeys columns of the path's
 * pathkeys.  The input is presumed already sorted that way.
 */
typedef struct UpperUniquePath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	int			numkeys;		/* number of pathkey columns to compare */
} UpperUniquePath;
|
|
|
|
|
|
|
|
/*
 * AggPath represents generic computation of aggregate functions
 *
 * This may involve plain grouping (but not grouping sets), using either
 * sorted or hashed grouping; for the AGG_SORTED case, the input must be
 * appropriately presorted.
 */
typedef struct AggPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
	Cardinality numGroups;		/* estimated number of groups in input */
	uint64		transitionSpace;	/* for pass-by-ref transition data */
	List	   *groupClause;	/* a list of SortGroupClause's */
	List	   *qual;			/* quals (HAVING quals), if any */
} AggPath;
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
/*
 * Various annotations used for grouping sets in the planner.
 */
typedef struct GroupingSetData
{
	NodeTag		type;
	List	   *set;			/* grouping set as list of sortgrouprefs */
	Cardinality numGroups;		/* est. number of result groups */
} GroupingSetData;
|
|
|
|
|
|
|
|
/*
 * RollupData describes one rollup: a group of grouping sets that can be
 * computed in a single pass over sorted (or hashed) input.
 */
typedef struct RollupData
{
	NodeTag		type;
	List	   *groupClause;	/* applicable subset of parse->groupClause */
	List	   *gsets;			/* lists of integer indexes into groupClause */
	List	   *gsets_data;		/* list of GroupingSetData */
	Cardinality numGroups;		/* est. number of result groups */
	bool		hashable;		/* can be hashed */
	bool		is_hashed;		/* to be implemented as a hashagg */
} RollupData;
|
|
|
|
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
/*
 * GroupingSetsPath represents a GROUPING SETS aggregation
 */
typedef struct GroupingSetsPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	AggStrategy aggstrategy;	/* basic strategy */
	List	   *rollups;		/* list of RollupData */
	List	   *qual;			/* quals (HAVING quals), if any */
	uint64		transitionSpace;	/* for pass-by-ref transition data */
} GroupingSetsPath;
|
|
|
|
|
|
|
|
/*
 * MinMaxAggPath represents computation of MIN/MAX aggregates from indexes
 */
typedef struct MinMaxAggPath
{
	Path		path;
	List	   *mmaggregates;	/* list of MinMaxAggInfo */
	List	   *quals;			/* HAVING quals, if any */
} MinMaxAggPath;
|
|
|
|
|
|
|
|
/*
 * WindowAggPath represents generic computation of window functions
 */
typedef struct WindowAggPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	WindowClause *winclause;	/* WindowClause we'll be using */
} WindowAggPath;
|
|
|
|
|
|
|
|
/*
 * SetOpPath represents a set-operation, that is INTERSECT or EXCEPT
 */
typedef struct SetOpPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	SetOpCmd	cmd;			/* what to do, see nodes.h */
	SetOpStrategy strategy;		/* how to do it, see nodes.h */
	List	   *distinctList;	/* SortGroupClauses identifying target cols */
	AttrNumber	flagColIdx;		/* where is the flag column, if any */
	int			firstFlag;		/* flag value for first input relation */
	Cardinality numGroups;		/* estimated number of groups in input */
} SetOpPath;
|
|
|
|
|
|
|
|
/*
 * RecursiveUnionPath represents a recursive UNION node
 */
typedef struct RecursiveUnionPath
{
	Path		path;
	Path	   *leftpath;		/* paths representing input sources */
	Path	   *rightpath;
	List	   *distinctList;	/* SortGroupClauses identifying target cols */
	int			wtParam;		/* ID of Param representing work table */
	Cardinality numGroups;		/* estimated number of groups in input */
} RecursiveUnionPath;
|
|
|
|
|
|
|
|
/*
 * LockRowsPath represents acquiring row locks for SELECT FOR UPDATE/SHARE
 */
typedef struct LockRowsPath
{
	Path		path;
	Path	   *subpath;		/* path representing input source */
	List	   *rowMarks;		/* a list of PlanRowMark's */
	int			epqParam;		/* ID of Param for EvalPlanQual re-eval */
} LockRowsPath;
|
|
|
|
|
|
|
|
/*
|
2022-03-28 16:45:58 +02:00
|
|
|
* ModifyTablePath represents performing INSERT/UPDATE/DELETE/MERGE
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
*
|
|
|
|
* We represent most things that will be in the ModifyTable plan node
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
* literally, except we have a child Path not Plan. But analysis of the
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
* OnConflictExpr is deferred to createplan.c, as is collection of FDW data.
|
|
|
|
*/
|
|
|
|
typedef struct ModifyTablePath
|
|
|
|
{
|
|
|
|
Path path;
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
Path *subpath; /* Path producing source data */
|
2022-03-28 16:45:58 +02:00
|
|
|
CmdType operation; /* INSERT, UPDATE, DELETE, or MERGE */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
bool canSetTag; /* do we set the command tag/es_processed? */
|
|
|
|
Index nominalRelation; /* Parent RT index for use of EXPLAIN */
|
2018-10-07 20:33:17 +02:00
|
|
|
Index rootRelation; /* Root RT index, if target is partitioned */
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
bool partColsUpdated; /* some part key in hierarchy updated? */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
List *resultRelations; /* integer list of RT indexes */
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
List *updateColnosLists; /* per-target-table update_colnos lists */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
List *withCheckOptionLists; /* per-target-table WCO lists */
|
|
|
|
List *returningLists; /* per-target-table RETURNING tlists */
|
|
|
|
List *rowMarks; /* PlanRowMarks (non-locking only) */
|
|
|
|
OnConflictExpr *onconflict; /* ON CONFLICT clause, or NULL */
|
|
|
|
int epqParam; /* ID of Param for EvalPlanQual re-eval */
|
2022-03-28 16:45:58 +02:00
|
|
|
List *mergeActionLists; /* per-target-table lists of actions for
|
|
|
|
* MERGE */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
} ModifyTablePath;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* LimitPath represents applying LIMIT/OFFSET restrictions
|
|
|
|
*/
|
|
|
|
typedef struct LimitPath
|
|
|
|
{
|
|
|
|
Path path;
|
|
|
|
Path *subpath; /* path representing input source */
|
|
|
|
Node *limitOffset; /* OFFSET parameter, or NULL if none */
|
|
|
|
Node *limitCount; /* COUNT parameter, or NULL if none */
|
2020-04-07 22:22:13 +02:00
|
|
|
LimitOption limitOption; /* FETCH FIRST with ties or exact number */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
} LimitPath;
|
|
|
|
|
|
|
|
|
1999-02-22 20:55:44 +01:00
|
|
|
/*
|
1999-07-25 01:21:14 +02:00
|
|
|
* Restriction clause info.
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
1999-07-25 01:21:14 +02:00
|
|
|
* We create one of these for each AND sub-clause of a restriction condition
|
2000-09-29 20:21:41 +02:00
|
|
|
* (WHERE or JOIN/ON clause). Since the restriction clauses are logically
|
|
|
|
* ANDed, we can use any one of them or any subset of them to filter out
|
|
|
|
* tuples, without having to evaluate the rest. The RestrictInfo node itself
|
|
|
|
* stores data used by the optimizer while choosing the best query plan.
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
2000-02-07 05:41:04 +01:00
|
|
|
* If a restriction clause references a single base relation, it will appear
|
|
|
|
* in the baserestrictinfo list of the RelOptInfo for that base rel.
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
2000-02-07 05:41:04 +01:00
|
|
|
* If a restriction clause references more than one base rel, it will
|
2005-06-09 06:19:00 +02:00
|
|
|
* appear in the joininfo list of every RelOptInfo that describes a strict
|
|
|
|
* subset of the base rels mentioned in the clause. The joininfo lists are
|
1999-08-16 04:17:58 +02:00
|
|
|
* used to drive join tree building by selecting plausible join candidates.
|
2000-02-07 05:41:04 +01:00
|
|
|
* The clause cannot actually be applied until we have built a join rel
|
|
|
|
* containing all the base rels it references, however.
|
|
|
|
*
|
2000-09-12 23:07:18 +02:00
|
|
|
* When we construct a join rel that includes all the base rels referenced
|
|
|
|
* in a multi-relation restriction clause, we place that clause into the
|
|
|
|
* joinrestrictinfo lists of paths for the join rel, if neither left nor
|
|
|
|
* right sub-path includes all base rels referenced in the clause. The clause
|
|
|
|
* will be applied at that join level, and will not propagate any further up
|
|
|
|
* the join tree. (Note: the "predicate migration" code was once intended to
|
2000-02-07 05:41:04 +01:00
|
|
|
* push restriction clauses up and down the plan tree based on evaluation
|
|
|
|
* costs, but it's dead code and is unlikely to be resurrected in the
|
|
|
|
* foreseeable future.)
|
|
|
|
*
|
|
|
|
* Note that in the presence of more than two rels, a multi-rel restriction
|
|
|
|
* might reach different heights in the join tree depending on the join
|
|
|
|
* sequence we use. So, these clauses cannot be associated directly with
|
|
|
|
* the join RelOptInfo, but must be kept track of on a per-join-path basis.
|
1999-08-16 04:17:58 +02:00
|
|
|
*
|
2007-01-20 21:45:41 +01:00
|
|
|
* RestrictInfos that represent equivalence conditions (i.e., mergejoinable
|
|
|
|
* equalities that are not outerjoin-delayed) are handled a bit differently.
|
|
|
|
* Initially we attach them to the EquivalenceClasses that are derived from
|
|
|
|
* them. When we construct a scan or join path, we look through all the
|
|
|
|
* EquivalenceClasses and generate derived RestrictInfos representing the
|
|
|
|
* minimal set of conditions that need to be checked for this particular scan
|
|
|
|
* or join to enforce that all members of each EquivalenceClass are in fact
|
|
|
|
* equal in all rows emitted by the scan or join.
|
|
|
|
*
|
2000-09-29 20:21:41 +02:00
|
|
|
* When dealing with outer joins we have to be very careful about pushing qual
|
|
|
|
* clauses up and down the tree. An outer join's own JOIN/ON conditions must
|
Fix some planner issues found while investigating Kevin Grittner's report
of poorer planning in 8.3 than 8.2:
1. After pushing a constant across an outer join --- ie, given
"a LEFT JOIN b ON (a.x = b.y) WHERE a.x = 42", we can deduce that b.y is
sort of equal to 42, in the sense that we needn't fetch any b rows where
it isn't 42 --- loop to see if any additional deductions can be made.
Previous releases did that by recursing, but I had mistakenly thought that
this was no longer necessary given the EquivalenceClass machinery.
2. Allow pushing constants across outer join conditions even if the
condition is outerjoin_delayed due to a lower outer join. This is safe
as long as the condition is strict and we re-test it at the upper join.
3. Keep the outer-join clause even if we successfully push a constant
across it. This is *necessary* in the outerjoin_delayed case, but
even in the simple case, it seems better to do this to ensure that the
join search order heuristics will consider the join as reasonable to
make. Mark such a clause as having selectivity 1.0, though, since it's
not going to eliminate very many rows after application of the constant
condition.
4. Tweak have_relevant_eclass_joinclause to report that two relations
are joinable when they have vars that are equated to the same constant.
We won't actually generate any joinclause from such an EquivalenceClass,
but again it seems that in such a case it's a good idea to consider
the join as worth costing out.
5. Fix a bug in select_mergejoin_clauses that was exposed by these
changes: we have to reject candidate mergejoin clauses if either side was
equated to a constant, because we can't construct a canonical pathkey list
for such a clause. This is an implementation restriction that might be
worth fixing someday, but it doesn't seem critical to get it done for 8.3.
2008-01-09 21:42:29 +01:00
|
|
|
* be evaluated exactly at that join node, unless they are "degenerate"
|
|
|
|
* conditions that reference only Vars from the nullable side of the join.
|
|
|
|
* Quals appearing in WHERE or in a JOIN above the outer join cannot be pushed
|
|
|
|
* down below the outer join, if they reference any nullable Vars.
|
|
|
|
* RestrictInfo nodes contain a flag to indicate whether a qual has been
|
2000-09-29 20:21:41 +02:00
|
|
|
* pushed down to a lower level than its original syntactic placement in the
|
|
|
|
* join tree would suggest. If an outer join prevents us from pushing a qual
|
|
|
|
* down to its "natural" semantic level (the level associated with just the
|
2005-06-09 06:19:00 +02:00
|
|
|
* base rels used in the qual) then we mark the qual with a "required_relids"
|
|
|
|
* value including more than just the base rels it actually uses. By
|
Fix some planner issues found while investigating Kevin Grittner's report
of poorer planning in 8.3 than 8.2:
1. After pushing a constant across an outer join --- ie, given
"a LEFT JOIN b ON (a.x = b.y) WHERE a.x = 42", we can deduce that b.y is
sort of equal to 42, in the sense that we needn't fetch any b rows where
it isn't 42 --- loop to see if any additional deductions can be made.
Previous releases did that by recursing, but I had mistakenly thought that
this was no longer necessary given the EquivalenceClass machinery.
2. Allow pushing constants across outer join conditions even if the
condition is outerjoin_delayed due to a lower outer join. This is safe
as long as the condition is strict and we re-test it at the upper join.
3. Keep the outer-join clause even if we successfully push a constant
across it. This is *necessary* in the outerjoin_delayed case, but
even in the simple case, it seems better to do this to ensure that the
join search order heuristics will consider the join as reasonable to
make. Mark such a clause as having selectivity 1.0, though, since it's
not going to eliminate very many rows after application of the constant
condition.
4. Tweak have_relevant_eclass_joinclause to report that two relations
are joinable when they have vars that are equated to the same constant.
We won't actually generate any joinclause from such an EquivalenceClass,
but again it seems that in such a case it's a good idea to consider
the join as worth costing out.
5. Fix a bug in select_mergejoin_clauses that was exposed by these
changes: we have to reject candidate mergejoin clauses if either side was
equated to a constant, because we can't construct a canonical pathkey list
for such a clause. This is an implementation restriction that might be
worth fixing someday, but it doesn't seem critical to get it done for 8.3.
2008-01-09 21:42:29 +01:00
|
|
|
* pretending that the qual references all the rels required to form the outer
|
2000-09-29 20:21:41 +02:00
|
|
|
* join, we prevent it from being evaluated below the outer join's joinrel.
|
|
|
|
* When we do form the outer join's joinrel, we still need to distinguish
|
|
|
|
* those quals that are actually in that join's JOIN/ON condition from those
|
2007-02-16 21:57:19 +01:00
|
|
|
* that appeared elsewhere in the tree and were pushed down to the join rel
|
2004-01-05 06:07:36 +01:00
|
|
|
* because they used no other rels. That's what the is_pushed_down flag is
|
2007-02-16 21:57:19 +01:00
|
|
|
* for; it tells us that a qual is not an OUTER JOIN qual for the set of base
|
|
|
|
* rels listed in required_relids. A clause that originally came from WHERE
|
|
|
|
* or an INNER JOIN condition will *always* have its is_pushed_down flag set.
|
|
|
|
* It's possible for an OUTER JOIN clause to be marked is_pushed_down too,
|
|
|
|
* if we decide that it can be pushed down into the nullable side of the join.
|
|
|
|
* In that case it acts as a plain filter qual for wherever it gets evaluated.
|
Fix some planner issues found while investigating Kevin Grittner's report
of poorer planning in 8.3 than 8.2:
1. After pushing a constant across an outer join --- ie, given
"a LEFT JOIN b ON (a.x = b.y) WHERE a.x = 42", we can deduce that b.y is
sort of equal to 42, in the sense that we needn't fetch any b rows where
it isn't 42 --- loop to see if any additional deductions can be made.
Previous releases did that by recursing, but I had mistakenly thought that
this was no longer necessary given the EquivalenceClass machinery.
2. Allow pushing constants across outer join conditions even if the
condition is outerjoin_delayed due to a lower outer join. This is safe
as long as the condition is strict and we re-test it at the upper join.
3. Keep the outer-join clause even if we successfully push a constant
across it. This is *necessary* in the outerjoin_delayed case, but
even in the simple case, it seems better to do this to ensure that the
join search order heuristics will consider the join as reasonable to
make. Mark such a clause as having selectivity 1.0, though, since it's
not going to eliminate very many rows after application of the constant
condition.
4. Tweak have_relevant_eclass_joinclause to report that two relations
are joinable when they have vars that are equated to the same constant.
We won't actually generate any joinclause from such an EquivalenceClass,
but again it seems that in such a case it's a good idea to consider
the join as worth costing out.
5. Fix a bug in select_mergejoin_clauses that was exposed by these
changes: we have to reject candidate mergejoin clauses if either side was
equated to a constant, because we can't construct a canonical pathkey list
for such a clause. This is an implementation restriction that might be
worth fixing someday, but it doesn't seem critical to get it done for 8.3.
2008-01-09 21:42:29 +01:00
|
|
|
* (In short, is_pushed_down is only false for non-degenerate outer join
|
2018-04-20 21:19:16 +02:00
|
|
|
* conditions. Possibly we should rename it to reflect that meaning? But
|
|
|
|
* see also the comments for RINFO_IS_PUSHED_DOWN, below.)
|
2004-01-05 06:07:36 +01:00
|
|
|
*
|
Fix some planner issues found while investigating Kevin Grittner's report
of poorer planning in 8.3 than 8.2:
1. After pushing a constant across an outer join --- ie, given
"a LEFT JOIN b ON (a.x = b.y) WHERE a.x = 42", we can deduce that b.y is
sort of equal to 42, in the sense that we needn't fetch any b rows where
it isn't 42 --- loop to see if any additional deductions can be made.
Previous releases did that by recursing, but I had mistakenly thought that
this was no longer necessary given the EquivalenceClass machinery.
2. Allow pushing constants across outer join conditions even if the
condition is outerjoin_delayed due to a lower outer join. This is safe
as long as the condition is strict and we re-test it at the upper join.
3. Keep the outer-join clause even if we successfully push a constant
across it. This is *necessary* in the outerjoin_delayed case, but
even in the simple case, it seems better to do this to ensure that the
join search order heuristics will consider the join as reasonable to
make. Mark such a clause as having selectivity 1.0, though, since it's
not going to eliminate very many rows after application of the constant
condition.
4. Tweak have_relevant_eclass_joinclause to report that two relations
are joinable when they have vars that are equated to the same constant.
We won't actually generate any joinclause from such an EquivalenceClass,
but again it seems that in such a case it's a good idea to consider
the join as worth costing out.
5. Fix a bug in select_mergejoin_clauses that was exposed by these
changes: we have to reject candidate mergejoin clauses if either side was
equated to a constant, because we can't construct a canonical pathkey list
for such a clause. This is an implementation restriction that might be
worth fixing someday, but it doesn't seem critical to get it done for 8.3.
2008-01-09 21:42:29 +01:00
|
|
|
* RestrictInfo nodes also contain an outerjoin_delayed flag, which is true
|
|
|
|
* if the clause's applicability must be delayed due to any outer joins
|
2009-04-16 22:42:16 +02:00
|
|
|
* appearing below it (ie, it has to be postponed to some join level higher
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
* than the set of relations it actually references).
|
|
|
|
*
|
|
|
|
* There is also an outer_relids field, which is NULL except for outer join
|
|
|
|
* clauses; for those, it is the set of relids on the outer side of the
|
|
|
|
* clause's outer join. (These are rels that the clause cannot be applied to
|
|
|
|
* in parameterized scans, since pushing it into the join's outer side would
|
|
|
|
* lead to wrong answers.)
|
|
|
|
*
|
|
|
|
* There is also a nullable_relids field, which is the set of rels the clause
|
|
|
|
* references that can be forced null by some outer join below the clause.
|
|
|
|
*
|
|
|
|
* outerjoin_delayed = true is subtly different from nullable_relids != NULL:
|
|
|
|
* a clause might reference some nullable rels and yet not be
|
|
|
|
* outerjoin_delayed because it also references all the other rels of the
|
|
|
|
* outer join(s). A clause that is not outerjoin_delayed can be enforced
|
|
|
|
* anywhere it is computable.
|
2005-11-15 00:54:23 +01:00
|
|
|
*
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
* To handle security-barrier conditions efficiently, we mark RestrictInfo
|
|
|
|
* nodes with a security_level field, in which higher values identify clauses
|
|
|
|
* coming from less-trusted sources. The exact semantics are that a clause
|
|
|
|
* cannot be evaluated before another clause with a lower security_level value
|
|
|
|
* unless the first clause is leakproof. As with outer-join clauses, this
|
|
|
|
* creates a reason for clauses to sometimes need to be evaluated higher in
|
|
|
|
* the join tree than their contents would suggest; and even at a single plan
|
|
|
|
* node, this rule constrains the order of application of clauses.
|
|
|
|
*
|
1999-08-16 04:17:58 +02:00
|
|
|
* In general, the referenced clause might be arbitrarily complex. The
|
|
|
|
* kinds of clauses we can handle as indexscan quals, mergejoin clauses,
|
2007-01-20 21:45:41 +01:00
|
|
|
* or hashjoin clauses are limited (e.g., no volatile functions). The code
|
|
|
|
* for each kind of path is responsible for identifying the restrict clauses
|
|
|
|
* it can use and ignoring the rest. Clauses not implemented by an indexscan,
|
2000-09-12 23:07:18 +02:00
|
|
|
* mergejoin, or hashjoin will be placed in the plan qual or joinqual field
|
2001-06-05 07:26:05 +02:00
|
|
|
* of the finished Plan node, where they will be enforced by general-purpose
|
1999-08-16 04:17:58 +02:00
|
|
|
* qual-expression-evaluation code. (But we are still entitled to count
|
|
|
|
* their selectivity when estimating the result tuple count, if we
|
|
|
|
* can guess what it is...)
|
2004-01-04 01:07:32 +01:00
|
|
|
*
|
|
|
|
* When the referenced clause is an OR clause, we generate a modified copy
|
|
|
|
* in which additional RestrictInfo nodes are inserted below the top-level
|
|
|
|
* OR/AND structure. This is a convenience for OR indexscan processing:
|
|
|
|
* indexquals taken from either the top level or an OR subclause will have
|
|
|
|
* associated RestrictInfo nodes.
|
Revise the planner's handling of "pseudoconstant" WHERE clauses, that is
clauses containing no variables and no volatile functions. Such a clause
can be used as a one-time qual in a gating Result plan node, to suppress
plan execution entirely when it is false. Even when the clause is true,
putting it in a gating node wins by avoiding repeated evaluation of the
clause. In previous PG releases, query_planner() would do this for
pseudoconstant clauses appearing at the top level of the jointree, but
there was no ability to generate a gating Result deeper in the plan tree.
To fix it, get rid of the special case in query_planner(), and instead
process pseudoconstant clauses through the normal RestrictInfo qual
distribution mechanism. When a pseudoconstant clause is found attached to
a path node in create_plan(), pull it out and generate a gating Result at
that point. This requires special-casing pseudoconstants in selectivity
estimation and cost_qual_eval, but on the whole it's pretty clean.
It probably even makes the planner a bit faster than before for the normal
case of no pseudoconstants, since removing pull_constant_clauses saves one
useless traversal of the qual tree. Per gripe from Phil Frost.
2006-07-01 20:38:33 +02:00
|
|
|
*
|
|
|
|
* The can_join flag is set true if the clause looks potentially useful as
|
|
|
|
* a merge or hash join clause, that is if it is a binary opclause with
|
|
|
|
* nonoverlapping sets of relids referenced in the left and right sides.
|
|
|
|
* (Whether the operator is actually merge or hash joinable isn't checked,
|
|
|
|
* however.)
|
|
|
|
*
|
|
|
|
* The pseudoconstant flag is set true if the clause contains no Vars of
|
|
|
|
* the current query level and no volatile functions. Such a clause can be
|
|
|
|
* pulled out and used as a one-time qual in a gating Result node. We keep
|
|
|
|
* pseudoconstant clauses in the same lists as other RestrictInfos so that
|
|
|
|
* the regular clause-pushing machinery can assign them to the correct join
|
|
|
|
* level, but they need to be treated specially for cost and selectivity
|
|
|
|
* estimates. Note that a pseudoconstant clause can never be an indexqual
|
|
|
|
* or merge or hash join clause, so it's of no interest to large parts of
|
|
|
|
* the planner.
|
2007-01-20 21:45:41 +01:00
|
|
|
*
|
|
|
|
* When join clauses are generated from EquivalenceClasses, there may be
|
|
|
|
* several equally valid ways to enforce join equivalence, of which we need
|
|
|
|
* apply only one. We mark clauses of this kind by setting parent_ec to
|
|
|
|
* point to the generating EquivalenceClass. Multiple clauses with the same
|
|
|
|
* parent_ec in the same join are redundant.
|
1999-02-22 20:55:44 +01:00
|
|
|
*/
|
1996-08-28 03:59:28 +02:00
|
|
|
|
1999-02-03 21:15:53 +01:00
|
|
|
typedef struct RestrictInfo
{
	NodeTag		type;

	Expr	   *clause;			/* the represented clause of WHERE or JOIN */

	bool		is_pushed_down; /* true if clause was pushed down in level */

	bool		outerjoin_delayed;	/* true if delayed by lower outer join */

	bool		can_join;		/* true if clause looks potentially useful as
								 * a merge or hash join clause (binary
								 * opclause with nonoverlapping left/right
								 * relids); see comment above */

	bool		pseudoconstant; /* true if clause contains no Vars of the
								 * current query level and no volatile
								 * functions; see comment above */

	bool		leakproof;		/* true if known to contain no leaked Vars */

	VolatileFunctionStatus has_volatile;	/* cached indicator of whether the
											 * clause contains any volatile
											 * functions */

	Index		security_level; /* evaluation priority; clauses of lower
								 * security_level must be evaluated first
								 * unless leakproof; see comment above */

	/* The set of relids (varnos) actually referenced in the clause: */
	Relids		clause_relids;

	/* The set of relids required to evaluate the clause: */
	Relids		required_relids;

	/* If an outer-join clause, the outer-side relations, else NULL: */
	Relids		outer_relids;

	/* The relids used in the clause that are nullable by lower outer joins: */
	Relids		nullable_relids;

	/* These fields are set for any binary opclause: */
	Relids		left_relids;	/* relids in left side of clause */
	Relids		right_relids;	/* relids in right side of clause */

	/* This field is NULL unless clause is an OR clause: */
	Expr	   *orclause;		/* modified clause with RestrictInfos */

	/* This field is NULL unless clause is potentially redundant: */
	EquivalenceClass *parent_ec;	/* generating EquivalenceClass */

	/* cache space for cost and selectivity */
	QualCost	eval_cost;		/* eval cost of clause; -1 if not yet set */
	Selectivity norm_selec;		/* selectivity for "normal" (JOIN_INNER)
								 * semantics; -1 if not yet set; >1 means a
								 * redundant clause */
	Selectivity outer_selec;	/* selectivity for outer join semantics; -1 if
								 * not yet set */

	/* valid if clause is mergejoinable, else NIL */
	List	   *mergeopfamilies;	/* opfamilies containing clause operator */

	/* cache space for mergeclause processing; NULL if not yet set */
	EquivalenceClass *left_ec;	/* EquivalenceClass containing lefthand */
	EquivalenceClass *right_ec; /* EquivalenceClass containing righthand */
	EquivalenceMember *left_em; /* EquivalenceMember for lefthand */
	EquivalenceMember *right_em;	/* EquivalenceMember for righthand */
	List	   *scansel_cache;	/* list of MergeScanSelCache structs */

	/* transient workspace for use while considering a specific join path */
	bool		outer_is_left;	/* T = outer var on left, F = on right */

	/* valid if clause is hashjoinable, else InvalidOid: */
	Oid			hashjoinoperator;	/* copy of clause operator */

	/* cache space for hashclause processing; -1 if not yet set */
	Selectivity left_bucketsize;	/* avg bucketsize of left side */
	Selectivity right_bucketsize;	/* avg bucketsize of right side */
	Selectivity left_mcvfreq;	/* left side's most common val's freq */
	Selectivity right_mcvfreq;	/* right side's most common val's freq */

	/* hash equality operators used for memoize nodes, else InvalidOid */
	Oid			left_hasheqoperator;
	Oid			right_hasheqoperator;
} RestrictInfo;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2018-04-20 21:19:16 +02:00
|
|
|
/*
 * This macro embodies the correct way to test whether a RestrictInfo is
 * "pushed down" to a given outer join, that is, should be treated as a filter
 * clause rather than a join clause at that outer join.  This is certainly so
 * if is_pushed_down is true; but examining that is not sufficient anymore,
 * because outer-join clauses will get pushed down to lower outer joins when
 * we generate a path for the lower outer join that is parameterized by the
 * LHS of the upper one.  We can detect such a clause by noting that its
 * required_relids exceed the scope of the join.
 */
#define RINFO_IS_PUSHED_DOWN(rinfo, joinrelids) \
	((rinfo)->is_pushed_down || \
	 !bms_is_subset((rinfo)->required_relids, joinrelids))
|
|
|
|
|
2007-01-22 21:00:40 +01:00
|
|
|
/*
 * Since mergejoinscansel() is a relatively expensive function, and would
 * otherwise be invoked many times while planning a large join tree,
 * we go out of our way to cache its results.  Each mergejoinable
 * RestrictInfo carries a list of the specific sort orderings that have
 * been considered for use with it, and the resulting selectivities.
 */
typedef struct MergeScanSelCache
{
	/* Ordering details (cache lookup key) */
	Oid			opfamily;		/* btree opfamily defining the ordering */
	Oid			collation;		/* collation for the ordering */
	int			strategy;		/* sort direction (ASC or DESC) */
	bool		nulls_first;	/* do NULLs come before normal values? */
	/* Results */
	Selectivity leftstartsel;	/* first-join fraction for clause left side */
	Selectivity leftendsel;		/* last-join fraction for clause left side */
	Selectivity rightstartsel;	/* first-join fraction for clause right side */
	Selectivity rightendsel;	/* last-join fraction for clause right side */
} MergeScanSelCache;
|
|
|
|
|
2008-10-21 22:42:53 +02:00
|
|
|
/*
 * Placeholder node for an expression to be evaluated below the top level
 * of a plan tree.  This is used during planning to represent the contained
 * expression.  At the end of the planning process it is replaced by either
 * the contained expression or a Var referring to a lower-level evaluation of
 * the contained expression.  Typically the evaluation occurs below an outer
 * join, and Var references above the outer join might thereby yield NULL
 * instead of the expression value.
 *
 * Although the planner treats this as an expression node type, it is not
 * recognized by the parser or executor, so we declare it here rather than
 * in primnodes.h.
 */

typedef struct PlaceHolderVar
{
	Expr		xpr;
	Expr	   *phexpr;			/* the represented expression */
	Relids		phrels;			/* base relids syntactically within expr src */
	Index		phid;			/* ID for PHV (unique within planner run) */
	Index		phlevelsup;		/* > 0 if PHV belongs to outer query */
} PlaceHolderVar;
|
|
|
|
|
2008-08-14 20:48:00 +02:00
|
|
|
/*
 * "Special join" info.
 *
 * One-sided outer joins constrain the order of joining partially but not
 * completely.  We flatten such joins into the planner's top-level list of
 * relations to join, but record information about each outer join in a
 * SpecialJoinInfo struct.  These structs are kept in the PlannerInfo node's
 * join_info_list.
 *
 * Similarly, semijoins and antijoins created by flattening IN (subselect)
 * and EXISTS(subselect) clauses create partial constraints on join order.
 * These are likewise recorded in SpecialJoinInfo structs.
 *
 * We make SpecialJoinInfos for FULL JOINs even though there is no flexibility
 * of planning for them, because this simplifies make_join_rel()'s API.
 *
 * min_lefthand and min_righthand are the sets of base relids that must be
 * available on each side when performing the special join.  lhs_strict is
 * true if the special join's condition cannot succeed when the LHS variables
 * are all NULL (this means that an outer join can commute with upper-level
 * outer joins even if it appears in their RHS).  We don't bother to set
 * lhs_strict for FULL JOINs, however.
 *
 * It is not valid for either min_lefthand or min_righthand to be empty sets;
 * if they were, this would break the logic that enforces join order.
 *
 * syn_lefthand and syn_righthand are the sets of base relids that are
 * syntactically below this special join.  (These are needed to help compute
 * min_lefthand and min_righthand for higher joins.)
 *
 * delay_upper_joins is set true if we detect a pushed-down clause that has
 * to be evaluated after this join is formed (because it references the RHS).
 * Any outer joins that have such a clause and this join in their RHS cannot
 * commute with this join, because that would leave noplace to check the
 * pushed-down clause.  (We don't track this for FULL JOINs, either.)
 *
 * For a semijoin, we also extract the join operators and their RHS arguments
 * and set semi_operators, semi_rhs_exprs, semi_can_btree, and semi_can_hash.
 * This is done in support of possibly unique-ifying the RHS, so we don't
 * bother unless at least one of semi_can_btree and semi_can_hash can be set
 * true.  (You might expect that this information would be computed during
 * join planning; but it's helpful to have it available during planning of
 * parameterized table scans, so we store it in the SpecialJoinInfo structs.)
 *
 * jointype is never JOIN_RIGHT; a RIGHT JOIN is handled by switching
 * the inputs to make it a LEFT JOIN.  So the allowed values of jointype
 * in a join_info_list member are only LEFT, FULL, SEMI, or ANTI.
 *
 * For purposes of join selectivity estimation, we create transient
 * SpecialJoinInfo structures for regular inner joins; so it is possible
 * to have jointype == JOIN_INNER in such a structure, even though this is
 * not allowed within join_info_list.  We also create transient
 * SpecialJoinInfos with jointype == JOIN_INNER for outer joins, since for
 * cost estimation purposes it is sometimes useful to know the join size under
 * plain innerjoin semantics.  Note that lhs_strict, delay_upper_joins, and
 * of course the semi_xxx fields are not set meaningfully within such structs.
 */
/* Forward-typedef guard: other headers may pre-declare SpecialJoinInfo. */
#ifndef HAVE_SPECIALJOININFO_TYPEDEF
typedef struct SpecialJoinInfo SpecialJoinInfo;
#define HAVE_SPECIALJOININFO_TYPEDEF 1
#endif

struct SpecialJoinInfo
{
	NodeTag		type;
	Relids		min_lefthand;	/* base relids in minimum LHS for join */
	Relids		min_righthand;	/* base relids in minimum RHS for join */
	Relids		syn_lefthand;	/* base relids syntactically within LHS */
	Relids		syn_righthand;	/* base relids syntactically within RHS */
	JoinType	jointype;		/* always INNER, LEFT, FULL, SEMI, or ANTI */
	bool		lhs_strict;		/* joinclause is strict for some LHS rel */
	bool		delay_upper_joins;	/* can't commute with upper RHS */
	/* Remaining fields are set only for JOIN_SEMI jointype: */
	bool		semi_can_btree; /* true if semi_operators are all btree */
	bool		semi_can_hash;	/* true if semi_operators are all hash */
	List	   *semi_operators; /* OIDs of equality join operators */
	List	   *semi_rhs_exprs; /* righthand-side expressions of these ops */
};
|
2003-01-20 19:55:07 +01:00
|
|
|
|
2006-01-31 22:39:25 +01:00
|
|
|
/*
 * Append-relation info.
 *
 * When we expand an inheritable table or a UNION-ALL subselect into an
 * "append relation" (essentially, a list of child RTEs), we build an
 * AppendRelInfo for each child RTE.  The list of AppendRelInfos indicates
 * which child RTEs must be included when expanding the parent, and each node
 * carries information needed to translate between columns of the parent and
 * columns of the child.
 *
 * These structs are kept in the PlannerInfo node's append_rel_list, with
 * append_rel_array[] providing a convenient lookup method for the struct
 * associated with a particular child relid (there can be only one, though
 * parent rels may have many entries in append_rel_list).
 *
 * Note: after completion of the planner prep phase, any given RTE is an
 * append parent having entries in append_rel_list if and only if its
 * "inh" flag is set.  We clear "inh" for plain tables that turn out not
 * to have inheritance children, and (in an abuse of the original meaning
 * of the flag) we set "inh" for subquery RTEs that turn out to be
 * flattenable UNION ALL queries.  This lets us avoid useless searches
 * of append_rel_list.
 *
 * Note: the data structure assumes that append-rel members are single
 * baserels.  This is OK for inheritance, but it prevents us from pulling
 * up a UNION ALL member subquery if it contains a join.  While that could
 * be fixed with a more complex data structure, at present there's not much
 * point because no improvement in the plan could result.
 */

typedef struct AppendRelInfo
{
	NodeTag		type;

	/*
	 * These fields uniquely identify this append relationship.  There can be
	 * (in fact, always should be) multiple AppendRelInfos for the same
	 * parent_relid, but never more than one per child_relid, since a given
	 * RTE cannot be a child of more than one append parent.
	 */
	Index		parent_relid;	/* RT index of append parent rel */
	Index		child_relid;	/* RT index of append child rel */

	/*
	 * For an inheritance appendrel, the parent and child are both regular
	 * relations, and we store their rowtype OIDs here for use in translating
	 * whole-row Vars.  For a UNION-ALL appendrel, the parent and child are
	 * both subqueries with no named rowtype, and we store InvalidOid here.
	 */
	Oid			parent_reltype; /* OID of parent's composite type */
	Oid			child_reltype;	/* OID of child's composite type */

	/*
	 * The N'th element of this list is a Var or expression representing the
	 * child column corresponding to the N'th column of the parent.  This is
	 * used to translate Vars referencing the parent rel into references to
	 * the child.  A list element is NULL if it corresponds to a dropped
	 * column of the parent (this is only possible for inheritance cases, not
	 * UNION ALL).  The list elements are always simple Vars for inheritance
	 * cases, but can be arbitrary expressions in UNION ALL cases.
	 *
	 * Notice we only store entries for user columns (attno > 0).  Whole-row
	 * Vars are special-cased, and system columns (attno < 0) need no special
	 * translation since their attnos are the same for all tables.
	 *
	 * Caution: the Vars have varlevelsup = 0.  Be careful to adjust as needed
	 * when copying into a subquery.
	 */
	List	   *translated_vars;	/* Expressions in the child's Vars */

	/*
	 * This array simplifies translations in the reverse direction, from
	 * child's column numbers to parent's.  The entry at [ccolno - 1] is the
	 * 1-based parent column number for child column ccolno, or zero if that
	 * child column is dropped or doesn't exist in the parent.
	 */
	int			num_child_cols; /* length of array */
	AttrNumber *parent_colnos;	/* array of parent attnos, or zeroes */

	/*
	 * We store the parent table's OID here for inheritance, or InvalidOid for
	 * UNION ALL.  This is only needed to help in generating error messages if
	 * an attempt is made to reference a dropped parent column.
	 */
	Oid			parent_reloid;	/* OID of parent relation */
} AppendRelInfo;
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
/*
 * Information about a row-identity "resjunk" column in UPDATE/DELETE.
 *
 * In partitioned UPDATE/DELETE it's important for child partitions to share
 * row-identity columns whenever possible, so as not to chew up too many
 * targetlist columns.  We use these structs to track which identity columns
 * have been requested.  In the finished plan, each of these will give rise
 * to one resjunk entry in the targetlist of the ModifyTable's subplan node.
 *
 * All the Vars stored in RowIdentityVarInfos must have varno ROWID_VAR, for
 * convenience of detecting duplicate requests.  We'll replace that, in the
 * final plan, with the varno of the generating rel.
 *
 * Outside this list, a Var with varno ROWID_VAR and varattno k is a reference
 * to the k-th element of the row_identity_vars list (k counting from 1).
 * We add such a reference to root->processed_tlist when creating the entry,
 * and it propagates into the plan tree from there.
 */
typedef struct RowIdentityVarInfo
{
	NodeTag		type;

	Var		   *rowidvar;		/* Var to be evaluated (but varno=ROWID_VAR) */
	int32		rowidwidth;		/* estimated average width */
	char	   *rowidname;		/* name of the resjunk column */
	Relids		rowidrels;		/* RTE indexes of target rels using this */
} RowIdentityVarInfo;
|
|
|
|
|
2008-10-21 22:42:53 +02:00
|
|
|
/*
 * For each distinct placeholder expression generated during planning, we
 * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list.
 * This stores info that is needed centrally rather than in each copy of the
 * PlaceHolderVar.  The phid fields identify which PlaceHolderInfo goes with
 * each PlaceHolderVar.  Note that phid is unique throughout a planner run,
 * not just within a query level --- this is so that we need not reassign ID's
 * when pulling a subquery into its parent.
 *
 * The idea is to evaluate the expression at (only) the ph_eval_at join level,
 * then allow it to bubble up like a Var until the ph_needed join level.
 * ph_needed has the same definition as attr_needed for a regular Var.
 *
 * The PlaceHolderVar's expression might contain LATERAL references to vars
 * coming from outside its syntactic scope.  If so, those rels are *not*
 * included in ph_eval_at, but they are recorded in ph_lateral.
 *
 * Notice that when ph_eval_at is a join rather than a single baserel, the
 * PlaceHolderInfo may create constraints on join order: the ph_eval_at join
 * has to be formed below any outer joins that should null the PlaceHolderVar.
 *
 * We create a PlaceHolderInfo only after determining that the PlaceHolderVar
 * is actually referenced in the plan tree, so that unreferenced placeholders
 * don't result in unnecessary constraints on join order.
 */

typedef struct PlaceHolderInfo
{
	NodeTag		type;

	Index		phid;			/* ID for PH (unique within planner run) */
	PlaceHolderVar *ph_var;		/* copy of PlaceHolderVar tree */
	Relids		ph_eval_at;		/* lowest level we can evaluate value at */
	Relids		ph_lateral;		/* relids of contained lateral refs, if any */
	Relids		ph_needed;		/* highest level the value is needed at */
	int32		ph_width;		/* estimated attribute width */
} PlaceHolderInfo;
|
|
|
|
|
2010-11-04 17:01:17 +01:00
|
|
|
/*
 * This struct describes one potentially index-optimizable MIN/MAX aggregate
 * function.  MinMaxAggPath contains a list of these, and if we accept that
 * path, the list is stored into root->minmax_aggs for use during setrefs.c.
 */
typedef struct MinMaxAggInfo
{
	NodeTag		type;

	Oid			aggfnoid;		/* pg_proc Oid of the aggregate */
	Oid			aggsortop;		/* Oid of its sort operator */
	Expr	   *target;			/* expression we are aggregating on */
	PlannerInfo *subroot;		/* modified "root" for planning the subquery */
	Path	   *path;			/* access path for subquery */
	Cost		pathcost;		/* estimated cost to fetch first row */
	Param	   *param;			/* param for subplan's output */
} MinMaxAggInfo;
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
/*
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evalution mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
/*
 * At runtime, PARAM_EXEC slots are used to pass values around from one plan
 * node to another. They can be used to pass values down into subqueries (for
 * outer references in subqueries), or up out of subqueries (for the results
 * of a subplan), or from a NestLoop plan node into its inner relation (when
 * the inner scan is parameterized with values from the outer relation).
 * The planner is responsible for assigning nonconflicting PARAM_EXEC IDs to
 * the PARAM_EXEC Params it generates.
 *
 * Outer references are managed via root->plan_params, which is a list of
 * PlannerParamItems. While planning a subquery, each parent query level's
 * plan_params contains the values required from it by the current subquery.
 * During create_plan(), we use plan_params to track values that must be
 * passed from outer to inner sides of NestLoop plan nodes.
 *
 * The item a PlannerParamItem represents can be one of three kinds:
 *
 * A Var: the slot represents a variable of this level that must be passed
 * down because subqueries have outer references to it, or must be passed
 * from a NestLoop node to its inner scan. The varlevelsup value in the Var
 * will always be zero.
 *
 * A PlaceHolderVar: this works much like the Var case, except that the
 * entry is a PlaceHolderVar node with a contained expression. The PHV
 * will have phlevelsup = 0, and the contained expression is adjusted
 * to match in level.
 *
 * An Aggref (with an expression tree representing its argument): the slot
 * represents an aggregate expression that is an outer reference for some
 * subquery. The Aggref itself has agglevelsup = 0, and its argument tree
 * is adjusted to match in level.
 *
 * Note: we detect duplicate Var and PlaceHolderVar parameters and coalesce
 * them into one slot, but we do not bother to do that for Aggrefs.
 * The scope of duplicate-elimination only extends across the set of
 * parameters passed from one query level into a single subquery, or for
 * nestloop parameters across the set of nestloop parameters used in a single
 * query level. So there is no possibility of a PARAM_EXEC slot being used
 * for conflicting purposes.
 *
 * In addition, PARAM_EXEC slots are assigned for Params representing outputs
 * from subplans (values that are setParam items for those subplans). These
 * IDs need not be tracked via PlannerParamItems, since we do not need any
 * duplicate-elimination nor later processing of the represented expressions.
 * Instead, we just record the assignment of the slot number by appending to
 * root->glob->paramExecTypes.
 */
typedef struct PlannerParamItem
{
	NodeTag		type;

	Node	   *item;			/* the Var, PlaceHolderVar, or Aggref */
	int			paramId;		/* its assigned PARAM_EXEC slot number */
} PlannerParamItem;
|
|
|
|
|
2012-01-28 01:26:38 +01:00
|
|
|
/*
 * When making cost estimates for a SEMI/ANTI/inner_unique join, there are
 * some correction factors that are needed in both nestloop and hash joins
 * to account for the fact that the executor can stop scanning inner rows
 * as soon as it finds a match to the current outer row. These numbers
 * depend only on the selected outer and inner join relations, not on the
 * particular paths used for them, so it's worthwhile to calculate them
 * just once per relation pair not once per considered path. This struct
 * is filled by compute_semi_anti_join_factors and must be passed along
 * to the join cost estimation functions.
 *
 * outer_match_frac is the fraction of the outer tuples that are
 *		expected to have at least one match.
 * match_count is the average number of matches expected for
 *		outer tuples that have at least one match.
 */
typedef struct SemiAntiJoinFactors
{
	Selectivity outer_match_frac;
	Selectivity match_count;
} SemiAntiJoinFactors;
|
|
|
|
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
/*
 * Struct for extra information passed to subroutines of add_paths_to_joinrel
 *
 * restrictlist contains all of the RestrictInfo nodes for restriction
 *		clauses that apply to this join
 * mergeclause_list is a list of RestrictInfo nodes for available
 *		mergejoin clauses in this join
 * inner_unique is true if each outer tuple provably matches no more
 *		than one inner tuple
 * sjinfo is extra info about special joins for selectivity estimation
 * semifactors is as shown above (only valid for SEMI/ANTI/inner_unique joins)
 * param_source_rels are OK targets for parameterization of result paths
 */
typedef struct JoinPathExtraData
{
	List	   *restrictlist;
	List	   *mergeclause_list;
	bool		inner_unique;
	SpecialJoinInfo *sjinfo;
	SemiAntiJoinFactors semifactors;
	Relids		param_source_rels;
} JoinPathExtraData;
|
|
|
|
|
Implement partition-wise grouping/aggregation.
If the partition keys of input relation are part of the GROUP BY
clause, all the rows belonging to a given group come from a single
partition. This allows aggregation/grouping over a partitioned
relation to be broken down * into aggregation/grouping on each
partition. This should be no worse, and often better, than the normal
approach.
If the GROUP BY clause does not contain all the partition keys, we can
still perform partial aggregation for each partition and then finalize
aggregation after appending the partial results. This is less certain
to be a win, but it's still useful.
Jeevan Chalke, Ashutosh Bapat, Robert Haas. The larger patch series
of which this patch is a part was also reviewed and tested by Antonin
Houska, Rajkumar Raghuwanshi, David Rowley, Dilip Kumar, Konstantin
Knizhnik, Pascal Legrand, and Rafia Sabih.
Discussion: http://postgr.es/m/CAM2+6=V64_xhstVHie0Rz=KPEQnLJMZt_e314P0jaT_oJ9MR8A@mail.gmail.com
2018-03-22 17:49:48 +01:00
|
|
|
/*
 * Various flags indicating what kinds of grouping are possible.
 *
 * GROUPING_CAN_USE_SORT should be set if it's possible to perform
 * sort-based implementations of grouping. When grouping sets are in use,
 * this will be true if sorting is potentially usable for any of the grouping
 * sets, even if it's not usable for all of them.
 *
 * GROUPING_CAN_USE_HASH should be set if it's possible to perform
 * hash-based implementations of grouping.
 *
 * GROUPING_CAN_PARTIAL_AGG should be set if the aggregation is of a type
 * for which we support partial aggregation (not, for example, grouping sets).
 * It says nothing about parallel-safety or the availability of suitable paths.
 */
#define GROUPING_CAN_USE_SORT 0x0001
#define GROUPING_CAN_USE_HASH 0x0002
#define GROUPING_CAN_PARTIAL_AGG 0x0004
|
|
|
|
|
|
|
|
/*
 * What kind of partitionwise aggregation is in use?
 *
 * PARTITIONWISE_AGGREGATE_NONE: Not used.
 *
 * PARTITIONWISE_AGGREGATE_FULL: Aggregate each partition separately, and
 * append the results.
 *
 * PARTITIONWISE_AGGREGATE_PARTIAL: Partially aggregate each partition
 * separately, append the results, and then finalize aggregation.
 */
typedef enum
{
	PARTITIONWISE_AGGREGATE_NONE,
	PARTITIONWISE_AGGREGATE_FULL,
	PARTITIONWISE_AGGREGATE_PARTIAL
} PartitionwiseAggregateType;
|
|
|
|
|
|
|
|
/*
 * Struct for extra information passed to subroutines of create_grouping_paths
 *
 * flags indicating what kinds of grouping are possible.
 * partial_costs_set is true if the agg_partial_costs and agg_final_costs
 *		have been initialized.
 * agg_partial_costs gives partial aggregation costs.
 * agg_final_costs gives finalization costs.
 * target_parallel_safe is true if target is parallel safe.
 * havingQual gives list of quals to be applied after aggregation.
 * targetList gives list of columns to be projected.
 * patype is the type of partitionwise aggregation that is being performed.
 */
typedef struct
{
	/* Data which remains constant once set. */
	int			flags;
	bool		partial_costs_set;
	AggClauseCosts agg_partial_costs;
	AggClauseCosts agg_final_costs;

	/* Data which may differ across partitions. */
	bool		target_parallel_safe;
	Node	   *havingQual;
	List	   *targetList;
	PartitionwiseAggregateType patype;
} GroupPathExtraData;
|
|
|
|
|
2019-04-02 13:30:45 +02:00
|
|
|
/*
 * Struct for extra information passed to subroutines of grouping_planner
 *
 * limit_needed is true if we actually need a Limit plan node.
 * limit_tuples is an estimated bound on the number of output tuples,
 *		or -1 if no LIMIT or couldn't estimate.
 * count_est and offset_est are the estimated values of the LIMIT and OFFSET
 * expressions computed by preprocess_limit() (see comments for
 * preprocess_limit() for more information).
 */
typedef struct
{
	bool		limit_needed;
	Cardinality limit_tuples;
	int64		count_est;
	int64		offset_est;
} FinalPathExtraData;
|
|
|
|
|
2012-01-28 01:26:38 +01:00
|
|
|
/*
 * For speed reasons, cost estimation for join paths is performed in two
 * phases: the first phase tries to quickly derive a lower bound for the
 * join cost, and then we check if that's sufficient to reject the path.
 * If not, we come back for a more refined cost estimate. The first phase
 * fills a JoinCostWorkspace struct with its preliminary cost estimates
 * and possibly additional intermediate values. The second phase takes
 * these values as inputs to avoid repeating work.
 *
 * (Ideally we'd declare this in cost.h, but it's also needed in pathnode.h,
 * so seems best to put it here.)
 */
typedef struct JoinCostWorkspace
{
	/* Preliminary cost estimates --- must not be larger than final ones! */
	Cost		startup_cost;	/* cost expended before fetching any tuples */
	Cost		total_cost;		/* total cost (assuming all tuples fetched) */

	/* Fields below here should be treated as private to costsize.c */
	Cost		run_cost;		/* non-startup cost components */

	/* private for cost_nestloop code */
	Cost		inner_run_cost; /* also used by cost_mergejoin code */
	Cost		inner_rescan_run_cost;

	/* private for cost_mergejoin code */
	Cardinality outer_rows;
	Cardinality inner_rows;
	Cardinality outer_skip_rows;
	Cardinality inner_skip_rows;

	/* private for cost_hashjoin code */
	int			numbuckets;
	int			numbatches;
	Cardinality inner_rows_total;
} JoinCostWorkspace;
|
|
|
|
|
2020-11-24 09:45:00 +01:00
|
|
|
/*
 * AggInfo holds information about an aggregate that needs to be computed.
 * Multiple Aggrefs in a query can refer to the same AggInfo by having the
 * same 'aggno' value, so that the aggregate is computed only once.
 */
typedef struct AggInfo
{
	/*
	 * Link to an Aggref expr this state value is for.
	 *
	 * There can be multiple identical Aggref's sharing the same per-agg. This
	 * points to the first one of them.
	 */
	Aggref	   *representative_aggref;

	/* presumably the 'aggtransno' of the AggTransInfo used — TODO confirm */
	int			transno;

	/*
	 * "shareable" is false if this agg cannot share state values with other
	 * aggregates because the final function is read-write.
	 */
	bool		shareable;

	/* Oid of the final function or InvalidOid */
	Oid			finalfn_oid;
} AggInfo;
|
|
|
|
|
|
|
|
/*
 * AggTransInfo holds information about transition state that is used by one
 * or more aggregates in the query. Multiple aggregates can share the same
 * transition state, if they have the same inputs and the same transition
 * function. Aggrefs that share the same transition info have the same
 * 'aggtransno' value.
 */
typedef struct AggTransInfo
{
	/* input arguments and FILTER expression for this transition state */
	List	   *args;
	Expr	   *aggfilter;

	/* Oid of the state transition function */
	Oid			transfn_oid;

	/* Oid of the serialization function or InvalidOid */
	Oid			serialfn_oid;

	/* Oid of the deserialization function or InvalidOid */
	Oid			deserialfn_oid;

	/* Oid of the combine function or InvalidOid */
	Oid			combinefn_oid;

	/* Oid of state value's datatype */
	Oid			aggtranstype;
	int32		aggtranstypmod;
	int			transtypeLen;
	bool		transtypeByVal;
	int32		aggtransspace;

	/*
	 * initial value from pg_aggregate entry
	 */
	Datum		initValue;
	bool		initValueIsNull;
} AggTransInfo;
|
|
|
|
|
2019-01-29 22:49:25 +01:00
|
|
|
#endif /* PATHNODES_H */
|