/*-------------------------------------------------------------------------
 *
 * plannodes.h
 *	  definitions for query plan nodes
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/nodes/plannodes.h
 *
 *-------------------------------------------------------------------------
 */
|
|
|
|
#ifndef PLANNODES_H
#define PLANNODES_H

#include "access/sdir.h"
#include "access/stratnum.h"
#include "lib/stringinfo.h"
#include "nodes/bitmapset.h"
#include "nodes/lockoptions.h"
#include "nodes/primnodes.h"

|
|
|
|
/* ----------------------------------------------------------------
 *						node definitions
 * ----------------------------------------------------------------
 */

2007-02-20 18:32:18 +01:00
|
|
|
/* ----------------
|
|
|
|
* PlannedStmt node
|
|
|
|
*
|
|
|
|
* The output of the planner is a Plan tree headed by a PlannedStmt node.
|
|
|
|
* PlannedStmt holds the "one time" information needed by the executor.
|
Change representation of statement lists, and add statement location info.
This patch makes several changes that improve the consistency of
representation of lists of statements. It's always been the case
that the output of parse analysis is a list of Query nodes, whatever
the types of the individual statements in the list. This patch brings
similar consistency to the outputs of raw parsing and planning steps:
* The output of raw parsing is now always a list of RawStmt nodes;
the statement-type-dependent nodes are one level down from that.
* The output of pg_plan_queries() is now always a list of PlannedStmt
nodes, even for utility statements. In the case of a utility statement,
"planning" just consists of wrapping a CMD_UTILITY PlannedStmt around
the utility node. This list representation is now used in Portal and
CachedPlan plan lists, replacing the former convention of intermixing
PlannedStmts with bare utility-statement nodes.
Now, every list of statements has a consistent head-node type depending
on how far along it is in processing. This allows changing many places
that formerly used generic "Node *" pointers to use a more specific
pointer type, thus reducing the number of IsA() tests and casts needed,
as well as improving code clarity.
Also, the post-parse-analysis representation of DECLARE CURSOR is changed
so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained
SELECT remains a child of the DeclareCursorStmt rather than getting flipped
around to be the other way. It's now true for both Query and PlannedStmt
that utilityStmt is non-null if and only if commandType is CMD_UTILITY.
That allows simplifying a lot of places that were testing both fields.
(I think some of those were just defensive programming, but in many places,
it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.)
Because PlannedStmt carries a canSetTag field, we're also able to get rid
of some ad-hoc rules about how to reconstruct canSetTag for a bare utility
statement; specifically, the assumption that a utility is canSetTag if and
only if it's the only one in its list. While I see no near-term need for
relaxing that restriction, it's nice to get rid of the ad-hocery.
The API of ProcessUtility() is changed so that what it's passed is the
wrapper PlannedStmt not just the bare utility statement. This will affect
all users of ProcessUtility_hook, but the changes are pretty trivial; see
the affected contrib modules for examples of the minimum change needed.
(Most compilers should give pointer-type-mismatch warnings for uncorrected
code.)
There's also a change in the API of ExplainOneQuery_hook, to pass through
cursorOptions instead of expecting hook functions to know what to pick.
This is needed because of the DECLARE CURSOR changes, but really should
have been done in 9.6; it's unlikely that any extant hook functions
know about using CURSOR_OPT_PARALLEL_OK.
Finally, teach gram.y to save statement boundary locations in RawStmt
nodes, and pass those through to Query and PlannedStmt nodes. This allows
more intelligent handling of cases where a source query string contains
multiple statements. This patch doesn't actually do anything with the
information, but a follow-on patch will. (Passing this information through
cleanly is the true motivation for these changes; while I think this is all
good cleanup, it's unlikely we'd have bothered without this end goal.)
catversion bump because addition of location fields to struct Query
affects stored rules.
This patch is by me, but it owes a good deal to Fabien Coelho who did
a lot of preliminary work on the problem, and also reviewed the patch.
Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
|
|
|
*
|
|
|
|
* For simplicity in APIs, we also wrap utility statements in PlannedStmt
|
|
|
|
* nodes; in such cases, commandType == CMD_UTILITY, the statement itself
|
|
|
|
* is in the utilityStmt field, and the rest of the struct is mostly dummy.
|
|
|
|
* (We do use canSetTag, stmt_location, stmt_len, and possibly queryId.)
|
2007-02-20 18:32:18 +01:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct PlannedStmt
|
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
|
2018-04-12 12:22:56 +02:00
|
|
|
CmdType commandType; /* select|insert|update|delete|utility */
|
2007-02-20 18:32:18 +01:00
|
|
|
|
2017-10-12 01:52:46 +02:00
|
|
|
uint64 queryId; /* query identifier (copied from Query) */
|
2012-03-27 21:14:13 +02:00
|
|
|
|
2009-10-10 03:43:50 +02:00
|
|
|
bool hasReturning; /* is it insert|update|delete RETURNING? */
|
|
|
|
|
2011-02-26 00:56:23 +01:00
|
|
|
bool hasModifyingCTE; /* has insert|update|delete in WITH? */
|
|
|
|
|
2007-02-20 18:32:18 +01:00
|
|
|
bool canSetTag; /* do I set the command result tag? */
|
|
|
|
|
2007-09-20 19:56:33 +02:00
|
|
|
bool transientPlan; /* redo plan when TransactionXmin changes? */
|
|
|
|
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
bool dependsOnRole; /* is plan specific to current role? */
|
|
|
|
|
|
|
|
bool parallelModeNeeded; /* parallel mode required to execute? */
|
|
|
|
|
2018-03-22 19:45:07 +01:00
|
|
|
int jitFlags; /* which forms of JIT should be performed */
|
|
|
|
|
2007-02-20 18:32:18 +01:00
|
|
|
struct Plan *planTree; /* tree of Plan nodes */
|
|
|
|
|
|
|
|
List *rtable; /* list of RangeTblEntry nodes */
|
|
|
|
|
|
|
|
/* rtable indexes of target relations for INSERT/UPDATE/DELETE */
|
|
|
|
List *resultRelations; /* integer list of RT indexes, or NIL */
|
|
|
|
|
Further adjust EXPLAIN's choices of table alias names.
This patch causes EXPLAIN to always assign a separate table alias to the
parent RTE of an append relation (inheritance set); before, such RTEs
were ignored if not actually scanned by the plan. Since the child RTEs
now always have that same alias to start with (cf. commit 55a1954da),
the net effect is that the parent RTE usually gets the alias used or
implied by the query text, and the children all get that alias with "_N"
appended. (The exception to "usually" is if there are duplicate aliases
in different subtrees of the original query; then some of those original
RTEs will also have "_N" appended.)
This results in more uniform output for partitioned-table plans than
we had before: the partitioned table itself gets the original alias,
and all child tables have aliases with "_N", rather than the previous
behavior where one of the children would get an alias without "_N".
The reason for giving the parent RTE an alias, even if it isn't scanned
by the plan, is that we now use the parent's alias to qualify Vars that
refer to an appendrel output column and appear above the Append or
MergeAppend that computes the appendrel. But below the append, Vars
refer to some one of the child relations, and are displayed that way.
This seems clearer than the old behavior where a Var that could carry
values from any child relation was displayed as if it referred to only
one of them.
While at it, change ruleutils.c so that the code paths used by EXPLAIN
deal in Plan trees not PlanState trees. This effectively reverts a
decision made in commit 1cc29fe7c, which seemed like a good idea at
the time to make ruleutils.c consistent with explain.c. However,
it's problematic because we'd really like to allow executor startup
pruning to remove all the children of an append node when possible,
leaving no child PlanState to resolve Vars against. (That's not done
here, but will be in the next patch.) This requires different handling
of subplans and initplans than before, but is otherwise a pretty
straightforward change.
Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
|
|
|
List *appendRelations; /* list of AppendRelInfo nodes */
|
|
|
|
|
2017-04-12 22:06:49 +02:00
|
|
|
List *subplans; /* Plan trees for SubPlan expressions; note
|
|
|
|
* that some could be NULL */
|
2007-02-22 23:00:26 +01:00
|
|
|
|
2007-02-27 02:11:26 +01:00
|
|
|
Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */
|
|
|
|
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
List *rowMarks; /* a list of PlanRowMark's */
|
2007-02-20 18:32:18 +01:00
|
|
|
|
2007-10-11 20:05:27 +02:00
|
|
|
List *relationOids; /* OIDs of relations the plan depends on */
|
|
|
|
|
2008-09-09 20:58:09 +02:00
|
|
|
List *invalItems; /* other dependencies, as PlanInvalItems */
|
|
|
|
|
2017-11-13 21:24:12 +01:00
|
|
|
List *paramExecTypes; /* type OIDs for PARAM_EXEC Params */
|
Change representation of statement lists, and add statement location info.
This patch makes several changes that improve the consistency of
representation of lists of statements. It's always been the case
that the output of parse analysis is a list of Query nodes, whatever
the types of the individual statements in the list. This patch brings
similar consistency to the outputs of raw parsing and planning steps:
* The output of raw parsing is now always a list of RawStmt nodes;
the statement-type-dependent nodes are one level down from that.
* The output of pg_plan_queries() is now always a list of PlannedStmt
nodes, even for utility statements. In the case of a utility statement,
"planning" just consists of wrapping a CMD_UTILITY PlannedStmt around
the utility node. This list representation is now used in Portal and
CachedPlan plan lists, replacing the former convention of intermixing
PlannedStmts with bare utility-statement nodes.
Now, every list of statements has a consistent head-node type depending
on how far along it is in processing. This allows changing many places
that formerly used generic "Node *" pointers to use a more specific
pointer type, thus reducing the number of IsA() tests and casts needed,
as well as improving code clarity.
Also, the post-parse-analysis representation of DECLARE CURSOR is changed
so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained
SELECT remains a child of the DeclareCursorStmt rather than getting flipped
around to be the other way. It's now true for both Query and PlannedStmt
that utilityStmt is non-null if and only if commandType is CMD_UTILITY.
That allows simplifying a lot of places that were testing both fields.
(I think some of those were just defensive programming, but in many places,
it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.)
Because PlannedStmt carries a canSetTag field, we're also able to get rid
of some ad-hoc rules about how to reconstruct canSetTag for a bare utility
statement; specifically, the assumption that a utility is canSetTag if and
only if it's the only one in its list. While I see no near-term need for
relaxing that restriction, it's nice to get rid of the ad-hocery.
The API of ProcessUtility() is changed so that what it's passed is the
wrapper PlannedStmt not just the bare utility statement. This will affect
all users of ProcessUtility_hook, but the changes are pretty trivial; see
the affected contrib modules for examples of the minimum change needed.
(Most compilers should give pointer-type-mismatch warnings for uncorrected
code.)
There's also a change in the API of ExplainOneQuery_hook, to pass through
cursorOptions instead of expecting hook functions to know what to pick.
This is needed because of the DECLARE CURSOR changes, but really should
have been done in 9.6; it's unlikely that any extant hook functions
know about using CURSOR_OPT_PARALLEL_OK.
Finally, teach gram.y to save statement boundary locations in RawStmt
nodes, and pass those through to Query and PlannedStmt nodes. This allows
more intelligent handling of cases where a source query string contains
multiple statements. This patch doesn't actually do anything with the
information, but a follow-on patch will. (Passing this information through
cleanly is the true motivation for these changes; while I think this is all
good cleanup, it's unlikely we'd have bothered without this end goal.)
catversion bump because addition of location fields to struct Query
affects stored rules.
This patch is by me, but it owes a good deal to Fabien Coelho who did
a lot of preliminary work on the problem, and also reviewed the patch.
Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
|
|
|
|
|
|
|
Node *utilityStmt; /* non-null if this is utility stmt */
|
|
|
|
|
|
|
|
/* statement location in source string (copied from Query) */
|
|
|
|
int stmt_location; /* start location, or -1 if unknown */
|
|
|
|
int stmt_len; /* length in bytes; 0 means "rest of string" */
|
2007-02-20 18:32:18 +01:00
|
|
|
} PlannedStmt;
|
|
|
|
|
/* macro for fetching the Plan associated with a SubPlan node */
#define exec_subplan_get_plan(plannedstmt, subplan) \
	((Plan *) list_nth((plannedstmt)->subplans, (subplan)->plan_id - 1))
|
|
|
|
|
2007-02-20 18:32:18 +01:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ----------------
|
|
|
|
* Plan node
|
2002-12-05 16:50:39 +01:00
|
|
|
*
|
|
|
|
* All plan nodes "derive" from the Plan structure by having the
|
|
|
|
* Plan structure as the first field. This ensures that everything works
|
|
|
|
* when nodes are cast to Plan's. (node pointers are frequently cast to Plan*
|
|
|
|
* when passed around generically in the executor)
|
|
|
|
*
|
|
|
|
* We never actually instantiate any Plan nodes; this is just the common
|
|
|
|
* abstract superclass for all Plan-type nodes.
|
1996-08-28 03:59:28 +02:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct Plan
|
|
|
|
{
|
|
|
|
NodeTag type;
|
2000-01-09 01:26:47 +01:00
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/*
|
|
|
|
* estimated execution costs for plan (see costsize.c for more info)
|
|
|
|
*/
|
2000-02-15 21:49:31 +01:00
|
|
|
Cost startup_cost; /* cost expended before fetching any tuples */
|
|
|
|
Cost total_cost; /* total cost (assuming all tuples fetched) */
|
|
|
|
|
|
|
|
/*
|
2002-12-05 16:50:39 +01:00
|
|
|
* planner's estimate of result size of this plan step
|
2000-02-15 21:49:31 +01:00
|
|
|
*/
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality plan_rows; /* number of rows plan is expected to emit */
|
2000-02-15 21:49:31 +01:00
|
|
|
int plan_width; /* average row width in bytes */
|
2000-01-09 01:26:47 +01:00
|
|
|
|
2015-11-11 14:57:52 +01:00
|
|
|
/*
|
|
|
|
* information needed for parallel query
|
|
|
|
*/
|
|
|
|
bool parallel_aware; /* engage parallel-aware logic? */
|
2017-04-12 21:13:23 +02:00
|
|
|
bool parallel_safe; /* OK to use as part of parallel plan? */
|
2015-11-11 14:57:52 +01:00
|
|
|
|
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a
non-parallel-aware Append concurrently rather than serially to improve
performance when possible. Currently, the only node type that can be
run concurrently is a ForeignScan that is an immediate child of such an
Append. In the case where such ForeignScans access data on different
remote servers, this would run those ForeignScans concurrently, and
overlap the remote operations to be performed simultaneously, so it'll
improve the performance especially when the operations involve
time-consuming ones such as remote join and remote aggregation.
We may extend this to other node types such as joins or aggregates over
ForeignScans in the future.
This also adds the support for postgres_fdw, which is enabled by the
table-level/server-level option "async_capable". The default is false.
Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit
is mostly based on the patch proposed by Robert Haas, but also uses
stuff from the patch proposed by Kyotaro Horiguchi and from the patch
proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin
Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and
others.
Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com
Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
|
|
|
/*
|
|
|
|
* information needed for asynchronous execution
|
|
|
|
*/
|
|
|
|
bool async_capable; /* engage asynchronous-capable logic? */
|
|
|
|
|
2001-09-18 03:59:07 +02:00
|
|
|
/*
|
2002-12-05 16:50:39 +01:00
|
|
|
* Common structural data for all Plan types.
|
2001-09-18 03:59:07 +02:00
|
|
|
*/
|
2015-09-29 03:55:57 +02:00
|
|
|
int plan_node_id; /* unique across entire final plan tree */
|
2002-12-05 16:50:39 +01:00
|
|
|
List *targetlist; /* target list to be computed at this node */
|
|
|
|
List *qual; /* implicitly-ANDed qual conditions */
|
|
|
|
struct Plan *lefttree; /* input plan tree(s) */
|
|
|
|
struct Plan *righttree;
|
|
|
|
List *initPlan; /* Init Plan nodes (un-correlated expr
|
|
|
|
* subselects) */
|
2001-09-18 03:59:07 +02:00
|
|
|
|
|
|
|
/*
|
2002-12-05 16:50:39 +01:00
|
|
|
* Information for management of parameter-change-driven rescanning
|
2003-02-09 01:30:41 +01:00
|
|
|
*
|
|
|
|
* extParam includes the paramIDs of all external PARAM_EXEC params
|
|
|
|
* affecting this plan node or its children. setParam params from the
|
|
|
|
* node's initPlans are not included, but their extParams are.
|
|
|
|
*
|
|
|
|
* allParam includes all the extParam paramIDs, plus the IDs of local
|
|
|
|
* params that affect the node (i.e., the setParams of its initplans).
|
|
|
|
* These are _all_ the PARAM_EXEC params that affect this node.
|
2001-09-18 03:59:07 +02:00
|
|
|
*/
|
2003-02-09 01:30:41 +01:00
|
|
|
Bitmapset *extParam;
|
|
|
|
Bitmapset *allParam;
|
1996-08-28 03:59:28 +02:00
|
|
|
} Plan;
|
|
|
|
|
|
|
|
/* ----------------
 *	these are defined to avoid confusion problems with "left"
 *	and "right" and "inner" and "outer".  The convention is that
 *	the "left" plan is the "outer" plan and the "right" plan is
 *	the inner plan, but these make the code more readable.
 * ----------------
 */
#define innerPlan(node)			(((Plan *)(node))->righttree)
#define outerPlan(node)			(((Plan *)(node))->lefttree)
|
|
|
|
|
|
|
|
|
|
|
|
/* ----------------
|
2000-11-12 01:37:02 +01:00
|
|
|
* Result node -
|
|
|
|
* If no outer plan, evaluate a variable-free targetlist.
|
2002-11-06 01:00:45 +01:00
|
|
|
* If outer plan, return tuples from outer plan (after a level of
|
|
|
|
* projection as shown by targetlist).
|
|
|
|
*
|
|
|
|
* If resconstantqual isn't NULL, it represents a one-time qualification
|
|
|
|
* test (i.e., one that doesn't depend on any variables from the outer plan,
|
|
|
|
* so needs to be evaluated only once).
|
1996-08-28 03:59:28 +02:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct Result
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
Node *resconstantqual;
|
|
|
|
} Result;
|
|
|
|
|
Move targetlist SRF handling from expression evaluation to new executor node.
Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT
generate_series(1,5)) so far was done in the expression evaluation (i.e.
ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code.
This meant that most executor nodes performing projection, and most
expression evaluation functions, had to deal with the possibility that an
evaluated expression could return a set of return values.
That's bad because it leads to repeated code in a lot of places. It also,
and that's my (Andres's) motivation, made it a lot harder to implement a
more efficient way of doing expression evaluation.
To fix this, introduce a new executor node (ProjectSet) that can evaluate
targetlists containing one or more SRFs. To avoid the complexity of the old
way of handling nested expressions returning sets (e.g. having to pass up
ExprDoneCond, and dealing with arguments to functions returning sets etc.),
those SRFs can only be at the top level of the node's targetlist. The
planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is
only necessary in ProjectSet nodes and that SRFs are only present at the
top level of the node's targetlist. If there are nested SRFs the planner
creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get
input from an underlying node.
We also discussed and prototyped evaluating targetlist SRFs using ROWS
FROM(), but that turned out to be more complicated than we'd hoped.
While moving SRF evaluation to ProjectSet would allow to retain the old
"least common multiple" behavior when multiple SRFs are present in one
targetlist (i.e. continue returning rows until all SRFs are at the end of
their input at the same time), we decided to instead only return rows till
all SRFs are exhausted, returning NULL for already exhausted ones. We
deemed the previous behavior to be too confusing, unexpected and actually
not particularly useful.
As a side effect, the previously prohibited case of multiple set returning
arguments to a function, is now allowed. Not because it's particularly
desirable, but because it ends up working and there seems to be no argument
for adding code to prohibit it.
Currently the behavior for COALESCE and CASE containing SRFs has changed,
returning multiple rows from the expression, even when the SRF containing
"arm" of the expression is not evaluated. That's because the SRFs are
evaluated in a separate ProjectSet node. As that's quite confusing, we're
likely to instead prohibit SRFs in those places. But that's still being
discussed, and the code would reside in places not touched here, so that's
a task for later.
There's a lot of, now superfluous, code dealing with set return expressions
around. But as the changes to get rid of those are verbose largely boring,
it seems better for readability to keep the cleanup as a separate commit.
Author: Tom Lane and Andres Freund
Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
|
|
|
/* ----------------
|
|
|
|
* ProjectSet node -
|
|
|
|
* Apply a projection that includes set-returning functions to the
|
|
|
|
* output tuples of the outer plan.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct ProjectSet
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
} ProjectSet;
|
|
|
|
|
2009-10-10 03:43:50 +02:00
|
|
|
/* ----------------
|
|
|
|
* ModifyTable node -
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
 *		Apply rows produced by outer plan to result table(s),
 *		by inserting, updating, or deleting.
 *
 * If the originally named target table is a partitioned table, both
 * nominalRelation and rootRelation contain the RT index of the partition
 * root, which is not otherwise mentioned in the plan.  Otherwise rootRelation
 * is zero.  However, nominalRelation will always be set, as it's the rel that
 * EXPLAIN should claim is the INSERT/UPDATE/DELETE target.
 *
 * Note that rowMarks and epqParam are presumed to be valid for all the
 * table(s); they can't contain any info that varies across tables.
 * ----------------
 */
|
|
|
|
typedef struct ModifyTable
{
	Plan		plan;
	CmdType		operation;		/* INSERT, UPDATE, or DELETE */
	bool		canSetTag;		/* do we set the command tag/es_processed? */
	Index		nominalRelation;	/* Parent RT index for use of EXPLAIN */
	Index		rootRelation;	/* Root RT index, if target is partitioned */
	bool		partColsUpdated;	/* some part key in hierarchy updated? */
	List	   *resultRelations;	/* integer list of RT indexes */
	List	   *updateColnosLists;	/* per-target-table update_colnos lists */
	List	   *withCheckOptionLists;	/* per-target-table WCO lists */
	List	   *returningLists; /* per-target-table RETURNING tlists */
	List	   *fdwPrivLists;	/* per-target-table FDW private data lists */
	Bitmapset  *fdwDirectModifyPlans;	/* indices of FDW DM plans */
	List	   *rowMarks;		/* PlanRowMarks (non-locking only) */
	int			epqParam;		/* ID of Param for EvalPlanQual re-eval */
	OnConflictAction onConflictAction;	/* ON CONFLICT action */
	List	   *arbiterIndexes; /* List of ON CONFLICT arbiter index OIDs */
	List	   *onConflictSet;	/* INSERT ON CONFLICT DO UPDATE targetlist */
	List	   *onConflictCols; /* target column numbers for onConflictSet */
	Node	   *onConflictWhere;	/* WHERE for ON CONFLICT UPDATE */
	Index		exclRelRTI;		/* RTI of the EXCLUDED pseudo relation */
	List	   *exclRelTlist;	/* tlist of the EXCLUDED pseudo relation */
} ModifyTable;
|
|
|
|
|
2018-08-02 01:42:46 +02:00
|
|
|
struct PartitionPruneInfo; /* forward reference to struct below */
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ----------------
 *	 Append node -
 *		Generate the concatenation of the results of sub-plans.
 * ----------------
 */
|
|
|
|
typedef struct Append
{
	Plan		plan;
	Bitmapset  *apprelids;		/* RTIs of appendrel(s) formed by this node */
	List	   *appendplans;	/* list of sub-plans to concatenate */
	int			nasyncplans;	/* # of asynchronous plans */

	/*
	 * All 'appendplans' preceding this index are non-partial plans. All
	 * 'appendplans' from this index onwards are partial plans.
	 */
	int			first_partial_plan;

	/* Info for run-time subplan pruning; NULL if we're not doing that */
	struct PartitionPruneInfo *part_prune_info;
} Append;
|
|
|
|
|
2010-10-14 22:56:39 +02:00
|
|
|
/* ----------------
 *	 MergeAppend node -
 *		Merge the results of pre-sorted sub-plans to preserve the ordering.
 * ----------------
 */
|
|
|
|
typedef struct MergeAppend
{
	Plan		plan;
	Bitmapset  *apprelids;		/* RTIs of appendrel(s) formed by this node */
	List	   *mergeplans;		/* list of pre-sorted sub-plans to merge */
	/* these fields are just like the sort-key info in struct Sort: */
	int			numCols;		/* number of sort-key columns */
	AttrNumber *sortColIdx;		/* their indexes in the target list */
	Oid		   *sortOperators;	/* OIDs of operators to sort them by */
	Oid		   *collations;		/* OIDs of collations */
	bool	   *nullsFirst;		/* NULLS FIRST/LAST directions */
	/* Info for run-time subplan pruning; NULL if we're not doing that */
	struct PartitionPruneInfo *part_prune_info;
} MergeAppend;
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
/* ----------------
|
|
|
|
* RecursiveUnion node -
|
|
|
|
* Generate a recursive union of two subplans.
|
|
|
|
*
|
|
|
|
* The "outer" subplan is always the non-recursive term, and the "inner"
|
|
|
|
* subplan is the recursive term.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct RecursiveUnion
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
int wtParam; /* ID of Param representing work table */
|
2008-10-07 21:27:04 +02:00
|
|
|
/* Remaining fields are zero/null in UNION ALL case */
|
|
|
|
int numCols; /* number of columns to check for
|
|
|
|
* duplicate-ness */
|
|
|
|
AttrNumber *dupColIdx; /* their indexes in the target list */
|
|
|
|
Oid *dupOperators; /* equality operators to compare with */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *dupCollations;
|
2008-10-07 21:27:04 +02:00
|
|
|
long numGroups; /* estimated number of groups in input */
|
2008-10-04 23:56:55 +02:00
|
|
|
} RecursiveUnion;
|
|
|
|
|
2005-04-20 00:35:18 +02:00
|
|
|
/* ----------------
|
|
|
|
* BitmapAnd node -
|
|
|
|
* Generate the intersection of the results of sub-plans.
|
|
|
|
*
|
|
|
|
* The subplans must be of types that yield tuple bitmaps. The targetlist
|
|
|
|
* and qual fields of the plan are unused and are always NIL.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct BitmapAnd
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
List *bitmapplans;
|
|
|
|
} BitmapAnd;
|
|
|
|
|
|
|
|
/* ----------------
|
|
|
|
* BitmapOr node -
|
|
|
|
* Generate the union of the results of sub-plans.
|
|
|
|
*
|
|
|
|
* The subplans must be of types that yield tuple bitmaps. The targetlist
|
|
|
|
* and qual fields of the plan are unused and are always NIL.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct BitmapOr
|
|
|
|
{
|
|
|
|
Plan plan;
|
Support parallel bitmap heap scans.
The index is scanned by a single process, but then all cooperating
processes can iterate jointly over the resulting set of heap blocks.
In the future, we might also want to support using a parallel bitmap
index scan to set up for a parallel bitmap heap scan, but that's a
job for another day.
Dilip Kumar, with some corrections and cosmetic changes by me. The
larger patch set of which this is a part has been reviewed and tested
by (at least) Andres Freund, Amit Khandekar, Tushar Ahuja, Rafia
Sabih, Haribabu Kommi, Thomas Munro, and me.
Discussion: http://postgr.es/m/CAFiTN-uc4=0WxRGfCzs-xfkMYcSEWUC-Fon6thkJGjkh9i=13A@mail.gmail.com
2017-03-08 18:05:43 +01:00
|
|
|
bool isshared;
|
2005-04-20 00:35:18 +02:00
|
|
|
List *bitmapplans;
|
|
|
|
} BitmapOr;
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
|
|
|
|
* ==========
|
|
|
|
* Scan nodes
|
|
|
|
* ==========
|
|
|
|
*/
|
|
|
|
typedef struct Scan
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
Index scanrelid; /* relid is index into the range table */
|
|
|
|
} Scan;
|
|
|
|
|
|
|
|
/* ----------------
|
|
|
|
* sequential scan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
2021-08-08 16:55:51 +02:00
|
|
|
typedef struct SeqScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
} SeqScan;
|
1996-08-28 03:59:28 +02:00
|
|
|
|
2015-05-15 20:37:10 +02:00
|
|
|
/* ----------------
|
|
|
|
* table sample scan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM. (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.) Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type. This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which is undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
Back-patch to 9.5 so that we don't have to support the original API
in production.
2015-07-25 20:39:00 +02:00
|
|
|
typedef struct SampleScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
/* use struct pointer to avoid including parsenodes.h here */
|
|
|
|
struct TableSampleClause *tablesample;
|
|
|
|
} SampleScan;
|
2015-05-15 20:37:10 +02:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ----------------
|
|
|
|
* index scan node
|
2003-11-09 22:30:38 +01:00
|
|
|
*
|
2005-04-25 03:30:14 +02:00
|
|
|
* indexqualorig is an implicitly-ANDed list of index qual expressions, each
|
|
|
|
* in the same form it appeared in the query WHERE condition. Each should
|
|
|
|
* be of the form (indexkey OP comparisonval) or (comparisonval OP indexkey).
|
|
|
|
* The indexkey is a Var or expression referencing column(s) of the index's
|
|
|
|
* base table. The comparisonval might be any expression, but it won't use
|
2010-12-03 02:50:48 +01:00
|
|
|
* any columns of the base table. The expressions are ordered by index
|
|
|
|
* column position (but items referencing the same index column can appear
|
|
|
|
* in any order). indexqualorig is used at runtime only if we have to recheck
|
|
|
|
* a lossy indexqual.
|
2005-04-25 03:30:14 +02:00
|
|
|
*
|
|
|
|
* indexqual has the same form, but the expressions have been commuted if
|
|
|
|
* necessary to put the indexkeys on the left, and the indexkeys are replaced
|
2011-10-11 20:20:06 +02:00
|
|
|
* by Var nodes identifying the index columns (their varno is INDEX_VAR and
|
|
|
|
* their varattno is the index column number).
|
2010-12-03 02:50:48 +01:00
|
|
|
*
|
|
|
|
* indexorderbyorig is similarly the original form of any ORDER BY expressions
|
|
|
|
* that are being implemented by the index, while indexorderby is modified to
|
|
|
|
* have index column Vars on the left-hand side. Here, multiple expressions
|
|
|
|
* must appear in exactly the ORDER BY order, and this is not necessarily the
|
|
|
|
* index column order. Only the expressions are provided, not the auxiliary
|
|
|
|
* sort-order information from the ORDER BY SortGroupClauses; it's assumed
|
|
|
|
* that the sort ordering is fully determinable from the top-level operators.
|
2015-05-15 13:26:51 +02:00
|
|
|
* indexorderbyorig is used at runtime to recheck the ordering, if the index
|
|
|
|
* cannot calculate an accurate ordering. It is also needed for EXPLAIN.
|
|
|
|
*
|
2015-05-18 03:22:12 +02:00
|
|
|
* indexorderbyops is a list of the OIDs of the operators used to sort the
|
|
|
|
* ORDER BY expressions. This is used together with indexorderbyorig to
|
|
|
|
* recheck ordering at run time. (Note that indexorderby, indexorderbyorig,
|
|
|
|
* and indexorderbyops are used for amcanorderbyop cases, not amcanorder.)
|
2011-10-08 02:13:02 +02:00
|
|
|
*
|
|
|
|
* indexorderdir specifies the scan ordering, for indexscans on amcanorder
|
2011-10-11 20:20:06 +02:00
|
|
|
* indexes (for other indexes it should be "don't care").
|
1996-08-28 03:59:28 +02:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct IndexScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
2005-04-25 03:30:14 +02:00
|
|
|
Oid indexid; /* OID of index to scan */
|
2010-12-03 02:50:48 +01:00
|
|
|
List *indexqual; /* list of index quals (usually OpExprs) */
|
2005-04-25 03:30:14 +02:00
|
|
|
List *indexqualorig; /* the same in original form */
|
2010-12-03 02:50:48 +01:00
|
|
|
List *indexorderby; /* list of index ORDER BY exprs */
|
|
|
|
List *indexorderbyorig; /* the same in original form */
|
2015-05-18 03:22:12 +02:00
|
|
|
List *indexorderbyops; /* OIDs of sort ops for ORDER BY exprs */
|
2005-04-25 03:30:14 +02:00
|
|
|
ScanDirection indexorderdir; /* forward or backward or don't care */
|
1996-08-28 03:59:28 +02:00
|
|
|
} IndexScan;
|
|
|
|
|
2011-10-11 20:20:06 +02:00
|
|
|
/* ----------------
|
|
|
|
* index-only scan node
|
|
|
|
*
|
|
|
|
* IndexOnlyScan is very similar to IndexScan, but it specifies an
|
|
|
|
* index-only scan, in which the data comes from the index not the heap.
|
|
|
|
* Because of this, *all* Vars in the plan node's targetlist, qual, and
|
|
|
|
* index expressions reference index columns and have varno = INDEX_VAR.
|
|
|
|
* Hence we do not need separate indexqualorig and indexorderbyorig lists,
|
|
|
|
* since their contents would be equivalent to indexqual and indexorderby.
|
|
|
|
*
|
|
|
|
* To help EXPLAIN interpret the index Vars for display, we provide
|
|
|
|
* indextlist, which represents the contents of the index as a targetlist
|
|
|
|
* with one TLE per index column. Vars appearing in this list reference
|
|
|
|
* the base table, and this is the only field in the plan node that may
|
|
|
|
* contain such Vars.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct IndexOnlyScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
Oid indexid; /* OID of index to scan */
|
|
|
|
List *indexqual; /* list of index quals (usually OpExprs) */
|
|
|
|
List *indexorderby; /* list of index ORDER BY exprs */
|
|
|
|
List *indextlist; /* TargetEntry list describing index's cols */
|
|
|
|
ScanDirection indexorderdir; /* forward or backward or don't care */
|
|
|
|
} IndexOnlyScan;
|
|
|
|
|
1999-11-23 21:07:06 +01:00
|
|
|
/* ----------------
|
2005-04-20 00:35:18 +02:00
|
|
|
* bitmap index scan node
|
|
|
|
*
|
|
|
|
* BitmapIndexScan delivers a bitmap of potential tuple locations;
|
|
|
|
* it does not access the heap itself. The bitmap is used by an
|
|
|
|
* ancestor BitmapHeapScan node, possibly after passing through
|
|
|
|
* intermediate BitmapAnd and/or BitmapOr nodes to combine it with
|
|
|
|
* the results of other BitmapIndexScans.
|
|
|
|
*
|
2005-04-25 03:30:14 +02:00
|
|
|
* The fields have the same meanings as for IndexScan, except we don't
|
|
|
|
* store a direction flag because direction is uninteresting.
|
|
|
|
*
|
2005-04-20 00:35:18 +02:00
|
|
|
* In a BitmapIndexScan plan node, the targetlist and qual fields are
|
2005-04-25 03:30:14 +02:00
|
|
|
* not used and are always NIL. The indexqualorig field is unused at
|
2005-04-20 00:35:18 +02:00
|
|
|
* run time too, but is saved for the benefit of EXPLAIN.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct BitmapIndexScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
2005-04-25 03:30:14 +02:00
|
|
|
Oid indexid; /* OID of index to scan */
|
Support parallel bitmap heap scans.
The index is scanned by a single process, but then all cooperating
processes can iterate jointly over the resulting set of heap blocks.
In the future, we might also want to support using a parallel bitmap
index scan to set up for a parallel bitmap heap scan, but that's a
job for another day.
Dilip Kumar, with some corrections and cosmetic changes by me. The
larger patch set of which this is a part has been reviewed and tested
by (at least) Andres Freund, Amit Khandekar, Tushar Ahuja, Rafia
Sabih, Haribabu Kommi, Thomas Munro, and me.
Discussion: http://postgr.es/m/CAFiTN-uc4=0WxRGfCzs-xfkMYcSEWUC-Fon6thkJGjkh9i=13A@mail.gmail.com
2017-03-08 18:05:43 +01:00
|
|
|
bool isshared; /* Create shared bitmap if set */
|
2005-04-25 03:30:14 +02:00
|
|
|
List *indexqual; /* list of index quals (OpExprs) */
|
|
|
|
List *indexqualorig; /* the same in original form */
|
2005-04-20 00:35:18 +02:00
|
|
|
} BitmapIndexScan;
|
|
|
|
|
|
|
|
/* ----------------
|
|
|
|
* bitmap sequential scan node
|
|
|
|
*
|
|
|
|
* This needs a copy of the qual conditions being used by the input index
|
|
|
|
* scans because there are various cases where we need to recheck the quals;
|
|
|
|
* for example, when the bitmap is lossy about the specific rows on a page
|
|
|
|
* that meet the index condition.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct BitmapHeapScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
List *bitmapqualorig; /* index quals, in standard expr form */
|
|
|
|
} BitmapHeapScan;
|
|
|
|
|
|
|
|
/* ----------------
|
|
|
|
* tid scan node
|
2005-11-26 23:14:57 +01:00
|
|
|
*
|
|
|
|
* tidquals is an implicitly OR'ed list of qual expressions of the form
|
2018-12-30 21:24:28 +01:00
|
|
|
* "CTID = pseudoconstant", or "CTID = ANY(pseudoconstant_array)",
|
|
|
|
* or a CurrentOfExpr for the relation.
|
1999-11-23 21:07:06 +01:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct TidScan
|
|
|
|
{
|
2000-01-09 01:26:47 +01:00
|
|
|
Scan scan;
|
2005-11-26 23:14:57 +01:00
|
|
|
List *tidquals; /* qual(s) involving CTID = something */
|
1999-11-23 21:07:06 +01:00
|
|
|
} TidScan;
|
|
|
|
|
2021-02-27 10:59:36 +01:00
|
|
|
/* ----------------
|
|
|
|
* tid range scan node
|
|
|
|
*
|
|
|
|
* tidrangequals is an implicitly AND'ed list of qual expressions of the form
|
|
|
|
* "CTID relop pseudoconstant", where relop is one of >,>=,<,<=.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct TidRangeScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
List *tidrangequals; /* qual(s) involving CTID op something */
|
|
|
|
} TidRangeScan;
|
|
|
|
|
2000-09-29 20:21:41 +02:00
|
|
|
/* ----------------
|
|
|
|
* subquery scan node
|
|
|
|
*
|
|
|
|
* SubqueryScan is for scanning the output of a sub-query in the range table.
|
2007-02-19 03:23:12 +01:00
|
|
|
* We often need an extra plan node above the sub-query's plan to perform
|
|
|
|
* expression evaluations (which we can't push into the sub-query without
|
|
|
|
* risking changing its semantics). Although we are not scanning a physical
|
|
|
|
* relation, we make this a descendant of Scan anyway for code-sharing
|
|
|
|
* purposes.
|
2000-09-29 20:21:41 +02:00
|
|
|
*
|
|
|
|
* Note: we store the sub-plan in the type-specific subplan field, not in
|
|
|
|
* the generic lefttree field as you might expect. This is because we do
|
|
|
|
* not want plan-tree-traversal routines to recurse into the subplan without
|
|
|
|
* knowing that they are changing Query contexts.
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct SubqueryScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
Plan *subplan;
|
|
|
|
} SubqueryScan;
|
|
|
|
|
2002-05-12 22:10:05 +02:00
|
|
|
/* ----------------
|
|
|
|
* FunctionScan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct FunctionScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
Support multi-argument UNNEST(), and TABLE() syntax for multiple functions.
This patch adds the ability to write TABLE( function1(), function2(), ...)
as a single FROM-clause entry. The result is the concatenation of the
first row from each function, followed by the second row from each
function, etc; with NULLs inserted if any function produces fewer rows than
others. This is believed to be a much more useful behavior than what
Postgres currently does with multiple SRFs in a SELECT list.
This syntax also provides a reasonable way to combine use of column
definition lists with WITH ORDINALITY: put the column definition list
inside TABLE(), where it's clear that it doesn't control the ordinality
column as well.
Also implement SQL-compliant multiple-argument UNNEST(), by turning
UNNEST(a,b,c) into TABLE(unnest(a), unnest(b), unnest(c)).
The SQL standard specifies TABLE() with only a single function, not
multiple functions, and it seems to require an implicit UNNEST() which is
not what this patch does. There may be something wrong with that reading
of the spec, though, because if it's right then the spec's TABLE() is just
a pointless alternative spelling of UNNEST(). After further review of
that, we might choose to adopt a different syntax for what this patch does,
but in any case this functionality seems clearly worthwhile.
Andrew Gierth, reviewed by Zoltán Böszörményi and Heikki Linnakangas, and
significantly revised by me
2013-11-22 01:37:02 +01:00
|
|
|
List *functions; /* list of RangeTblFunction nodes */
|
|
|
|
bool funcordinality; /* WITH ORDINALITY */
|
2002-05-12 22:10:05 +02:00
|
|
|
} FunctionScan;
|
|
|
|
|
2006-08-02 03:59:48 +02:00
|
|
|
/* ----------------
|
|
|
|
* ValuesScan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct ValuesScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
2007-02-19 03:23:12 +01:00
|
|
|
List *values_lists; /* list of expression lists */
|
2006-08-02 03:59:48 +02:00
|
|
|
} ValuesScan;
|
|
|
|
|
2017-03-08 16:39:37 +01:00
|
|
|
/* ----------------
|
|
|
|
* TableFunc scan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct TableFuncScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
TableFunc *tablefunc; /* table function node */
|
|
|
|
} TableFuncScan;
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
/* ----------------
|
|
|
|
* CteScan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct CteScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
int ctePlanId; /* ID of init SubPlan for CTE */
|
|
|
|
int cteParam; /* ID of Param representing CTE output */
|
|
|
|
} CteScan;
|
|
|
|
|
2017-04-01 06:17:18 +02:00
|
|
|
/* ----------------
|
|
|
|
* NamedTuplestoreScan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct NamedTuplestoreScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
char *enrname; /* Name given to Ephemeral Named Relation */
|
|
|
|
} NamedTuplestoreScan;
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
/* ----------------
|
|
|
|
* WorkTableScan node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct WorkTableScan
|
|
|
|
{
|
|
|
|
Scan scan;
|
|
|
|
int wtParam; /* ID of Param representing work table */
|
|
|
|
} WorkTableScan;
|
|
|
|
|
2011-02-20 06:17:18 +01:00
|
|
|
/* ----------------
|
|
|
|
* ForeignScan node
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
|
|
|
*
|
|
|
|
* fdw_exprs and fdw_private are both under the control of the foreign-data
|
|
|
|
* wrapper, but fdw_exprs is presumed to contain expression trees and will
|
|
|
|
* be post-processed accordingly by the planner; fdw_private won't be.
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* Note that everything in both lists must be copiable by copyObject().
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
|
|
|
* One way to store an arbitrary blob of bytes is to represent it as a bytea
|
|
|
|
* Const. Usually, though, you'll be better off choosing a representation
|
|
|
|
* that can be dumped usefully by nodeToString().
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
*
|
|
|
|
* fdw_scan_tlist is a targetlist describing the contents of the scan tuple
|
|
|
|
* returned by the FDW; it can be NIL if the scan tuple matches the declared
|
|
|
|
* rowtype of the foreign table, which is the normal case for a simple foreign
|
|
|
|
* table scan. (If the plan node represents a foreign join, fdw_scan_tlist
|
|
|
|
* is required since there is no rowtype available from the system catalogs.)
|
|
|
|
* When fdw_scan_tlist is provided, Vars in the node's tlist and quals must
|
|
|
|
* have varno INDEX_VAR, and their varattnos correspond to resnos in the
|
|
|
|
* fdw_scan_tlist (which are also column numbers in the actual scan tuple).
|
|
|
|
* fdw_scan_tlist is never actually executed; it just holds expression trees
|
|
|
|
* describing what is in the scan tuple's columns.
|
|
|
|
*
|
2015-10-15 19:00:40 +02:00
|
|
|
* fdw_recheck_quals should contain any quals which the core system passed to
|
2015-10-20 17:11:35 +02:00
|
|
|
* the FDW but which were not added to scan.plan.qual; that is, it should
|
2015-10-15 19:00:40 +02:00
|
|
|
* contain the quals being checked remotely. This is needed for correct
|
|
|
|
* behavior during EvalPlanQual rechecks.
|
|
|
|
*
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
* When the plan node represents a foreign join, scan.scanrelid is zero and
|
|
|
|
* fs_relids must be consulted to identify the join relation. (fs_relids
|
|
|
|
* is valid for simple scans as well, but will always match scan.scanrelid.)
|
2020-10-14 09:58:38 +02:00
|
|
|
*
|
|
|
|
* If the FDW's PlanDirectModify() callback decides to repurpose a ForeignScan
|
|
|
|
* node to perform the UPDATE or DELETE operation directly in the remote
|
|
|
|
* server, it sets 'operation' and 'resultRelation' to identify the operation
|
|
|
|
* type and target relation. Note that these fields are only set if the
|
|
|
|
* modification is performed *fully* remotely; otherwise, the modification is
|
|
|
|
* driven by a local ModifyTable node and 'operation' is left to CMD_SELECT.
|
2011-02-20 06:17:18 +01:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct ForeignScan
{
	Scan		scan;
	CmdType		operation;		/* SELECT/INSERT/UPDATE/DELETE */
	Index		resultRelation; /* direct modification target's RT index */
	Oid			fs_server;		/* OID of foreign server */
	List	   *fdw_exprs;		/* expressions that FDW may evaluate */
	List	   *fdw_private;	/* private data for FDW */
	List	   *fdw_scan_tlist; /* optional tlist describing scan tuple */
	List	   *fdw_recheck_quals;	/* original quals not in scan.plan.qual */
	Bitmapset  *fs_relids;		/* RTIs generated by this scan */
	bool		fsSystemCol;	/* true if any "system column" is needed */
} ForeignScan;
|
|
|
|
|
2014-11-07 23:26:02 +01:00
|
|
|
/* ----------------
 *	 CustomScan node
 *
 * The comments for ForeignScan's fdw_exprs, fdw_private, fdw_scan_tlist,
 * and fs_relids fields apply equally to CustomScan's custom_exprs,
 * custom_private, custom_scan_tlist, and custom_relids fields.  The
 * convention of setting scan.scanrelid to zero for joins applies as well.
 *
 * Note that since Plan trees can be copied, custom scan providers *must*
 * fit all plan data they need into those fields; embedding CustomScan in
 * a larger struct will not work.
 * ----------------
 */
struct CustomScanMethods;

typedef struct CustomScan
{
	Scan		scan;
	uint32		flags;			/* mask of CUSTOMPATH_* flags, see
								 * nodes/extensible.h */
	List	   *custom_plans;	/* list of Plan nodes, if any */
	List	   *custom_exprs;	/* expressions that custom code may evaluate */
	List	   *custom_private; /* private data for custom code */
	List	   *custom_scan_tlist;	/* optional tlist describing scan tuple */
	Bitmapset  *custom_relids;	/* RTIs generated by this scan */
	/* vtable of provider callbacks; must outlive the plan (not copied) */
	const struct CustomScanMethods *methods;
} CustomScan;
|
|
|
|
|
1999-11-23 21:07:06 +01:00
|
|
|
/*
|
1996-08-28 03:59:28 +02:00
|
|
|
* ==========
|
|
|
|
* Join nodes
|
|
|
|
* ==========
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* ----------------
 *		Join node
 *
 * jointype:	rule for joining tuples from left and right subtrees
 * inner_unique each outer tuple can match to no more than one inner tuple
 * joinqual:	qual conditions that came from JOIN/ON or JOIN/USING
 *				(plan.qual contains conditions that came from WHERE)
 *
 * When jointype is INNER, joinqual and plan.qual are semantically
 * interchangeable.  For OUTER jointypes, the two are *not* interchangeable;
 * only joinqual is used to determine whether a match has been found for
 * the purpose of deciding whether to generate null-extended tuples.
 * (But plan.qual is still applied before actually returning a tuple.)
 * For an outer join, only joinquals are allowed to be used as the merge
 * or hash condition of a merge or hash join.
 *
 * inner_unique is set if the joinquals are such that no more than one inner
 * tuple could match any given outer tuple.  This allows the executor to
 * skip searching for additional matches.  (This must be provable from just
 * the joinquals, ignoring plan.qual, due to where the executor tests it.)
 * ----------------
 */
typedef struct Join
{
	Plan		plan;
	JoinType	jointype;		/* rule for joining left/right tuples */
	bool		inner_unique;	/* each outer tuple matches <= 1 inner tuple */
	List	   *joinqual;		/* JOIN quals (in addition to plan.qual) */
} Join;
|
1996-08-28 03:59:28 +02:00
|
|
|
|
|
|
|
/* ----------------
 *		nest loop join node
 *
 * The nestParams list identifies any executor Params that must be passed
 * into execution of the inner subplan carrying values from the current row
 * of the outer subplan.  Currently we restrict these values to be simple
 * Vars, but perhaps someday that'd be worth relaxing.  (Note: during plan
 * creation, the paramval can actually be a PlaceHolderVar expression; but it
 * must be a Var with varno OUTER_VAR by the time it gets to the executor.)
 * ----------------
 */
typedef struct NestLoop
{
	Join		join;
	List	   *nestParams;		/* list of NestLoopParam nodes */
} NestLoop;
|
|
|
|
|
2010-07-12 19:01:06 +02:00
|
|
|
/* one entry of NestLoop.nestParams: binds an outer Var to a PARAM_EXEC slot */
typedef struct NestLoopParam
{
	NodeTag		type;
	int			paramno;		/* number of the PARAM_EXEC Param to set */
	Var		   *paramval;		/* outer-relation Var to assign to Param */
} NestLoopParam;
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ----------------
 *		merge join node
 *
 * The expected ordering of each mergeable column is described by a btree
 * opfamily OID, a collation OID, a direction (BTLessStrategyNumber or
 * BTGreaterStrategyNumber) and a nulls-first flag.  Note that the two sides
 * of each mergeclause may be of different datatypes, but they are ordered the
 * same way according to the common opfamily and collation.  The operator in
 * each mergeclause must be an equality operator of the indicated opfamily.
 * ----------------
 */
typedef struct MergeJoin
{
	Join		join;
	bool		skip_mark_restore;	/* Can we skip mark/restore calls? */
	List	   *mergeclauses;	/* mergeclauses as expression trees */
	/* these are arrays, but have the same length as the mergeclauses list: */
	Oid		   *mergeFamilies;	/* per-clause OIDs of btree opfamilies */
	Oid		   *mergeCollations;	/* per-clause OIDs of collations */
	int		   *mergeStrategies;	/* per-clause ordering (ASC or DESC) */
	bool	   *mergeNullsFirst;	/* per-clause nulls ordering */
} MergeJoin;
|
|
|
|
|
|
|
|
/* ----------------
 *		hash join node
 * ----------------
 */
typedef struct HashJoin
{
	Join		join;
	List	   *hashclauses;	/* hash join clauses, as expression trees */
	/* per-clause lists, same length as hashclauses (split out at plan time): */
	List	   *hashoperators;	/* per-clause equality operator OIDs */
	List	   *hashcollations; /* per-clause collation OIDs */

	/*
	 * List of expressions to be hashed for tuples from the outer plan, to
	 * perform lookups in the hashtable over the inner plan.
	 */
	List	   *hashkeys;
} HashJoin;
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/* ----------------
 *		materialization node
 *
 * No fields beyond the basic Plan: Material simply buffers the output of
 * its (left) subplan so it can be rescanned cheaply.
 * ----------------
 */
typedef struct Material
{
	Plan		plan;
} Material;
|
|
|
|
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
/* ----------------
|
2021-07-14 02:43:58 +02:00
|
|
|
* memoize node
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
* ----------------
|
|
|
|
*/
|
2021-07-14 02:43:58 +02:00
|
|
|
typedef struct Memoize
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
|
|
|
|
int numKeys; /* size of the two arrays below */
|
|
|
|
|
|
|
|
Oid *hashOperators; /* hash operators for each key */
|
|
|
|
Oid *collations; /* cache keys */
|
|
|
|
List *param_exprs; /* exprs containing parameters */
|
|
|
|
bool singlerow; /* true if the cache entry should be marked as
|
|
|
|
* complete after we store the first tuple in
|
|
|
|
* it. */
|
2021-11-23 22:06:59 +01:00
|
|
|
bool binary_mode; /* true when cache key should be compared bit
|
|
|
|
* by bit, false when using hash equality ops */
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
uint32 est_entries; /* The maximum number of entries that the
|
|
|
|
* planner expects will fit in the cache, or 0
|
|
|
|
* if unknown */
|
2021-07-14 02:43:58 +02:00
|
|
|
} Memoize;
|
Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct values for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/* ----------------
|
|
|
|
* sort node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct Sort
|
|
|
|
{
|
|
|
|
Plan plan;
|
2003-05-06 02:20:33 +02:00
|
|
|
int numCols; /* number of sort-key columns */
|
|
|
|
AttrNumber *sortColIdx; /* their indexes in the target list */
|
|
|
|
Oid *sortOperators; /* OIDs of operators to sort them by */
|
2011-02-08 22:04:18 +01:00
|
|
|
Oid *collations; /* OIDs of collations, one per sort column */
|
2007-01-09 03:14:16 +01:00
|
|
|
bool *nullsFirst; /* NULLS FIRST/LAST directions, one per sort column */
|
2002-12-05 16:50:39 +01:00
|
|
|
} Sort;
|
|
|
|
|
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
|
|
|
/* ----------------
|
|
|
|
* incremental sort node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct IncrementalSort
|
|
|
|
{
|
|
|
|
Sort sort;
|
|
|
|
int nPresortedCols; /* number of presorted columns (leading prefix of the sort keys) */
|
|
|
|
} IncrementalSort;
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/* ---------------
|
|
|
|
* group node -
|
|
|
|
* Used for queries with GROUP BY (but no aggregates) specified.
|
|
|
|
* The input must be presorted according to the grouping columns.
|
|
|
|
* ---------------
|
|
|
|
*/
|
|
|
|
typedef struct Group
|
|
|
|
{
|
|
|
|
Plan plan;
|
|
|
|
int numCols; /* number of grouping columns */
|
|
|
|
AttrNumber *grpColIdx; /* their indexes in the target list */
|
2007-01-10 19:06:05 +01:00
|
|
|
Oid *grpOperators; /* equality operators to compare with */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *grpCollations; /* collations for grouping columns */
|
2002-12-05 16:50:39 +01:00
|
|
|
} Group;
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ---------------
|
|
|
|
* aggregate node
|
2002-11-06 01:00:45 +01:00
|
|
|
*
|
|
|
|
* An Agg node implements plain or grouped aggregation. For grouped
|
|
|
|
* aggregation, we can work with presorted input or unsorted input;
|
|
|
|
* the latter strategy uses an internal hashtable.
|
|
|
|
*
|
|
|
|
* Notice the lack of any direct info about the aggregate functions to be
|
|
|
|
* computed. They are found by scanning the node's tlist and quals during
|
|
|
|
* executor startup. (It is possible that there are no aggregate functions;
|
|
|
|
* this could happen if they get optimized away by constant-folding, or if
|
|
|
|
* we are using the Agg node to implement hash-based grouping.)
|
1996-08-28 03:59:28 +02:00
|
|
|
* ---------------
|
|
|
|
*/
|
|
|
|
typedef struct Agg
|
|
|
|
{
|
|
|
|
Plan plan;
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
AggStrategy aggstrategy; /* basic strategy, see nodes.h */
|
2016-06-26 20:33:38 +02:00
|
|
|
AggSplit aggsplit; /* agg-splitting mode, see nodes.h */
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
int numCols; /* number of grouping columns */
|
|
|
|
AttrNumber *grpColIdx; /* their indexes in the target list */
|
2007-01-10 19:06:05 +01:00
|
|
|
Oid *grpOperators; /* equality operators to compare with */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *grpCollations; /* collations for grouping columns */
|
2002-11-06 23:31:24 +01:00
|
|
|
long numGroups; /* estimated number of groups in input */
|
2020-02-28 18:32:35 +01:00
|
|
|
uint64 transitionSpace; /* for pass-by-ref transition data */
|
2016-08-24 20:37:50 +02:00
|
|
|
Bitmapset *aggParams; /* IDs of Params used in Aggref inputs */
|
2017-03-27 05:20:54 +02:00
|
|
|
/* Note: planner provides numGroups & aggParams only in HASHED/MIXED case */
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
List *groupingSets; /* grouping sets to use */
|
|
|
|
List *chain; /* chained Agg/Sort nodes */
|
1996-08-28 03:59:28 +02:00
|
|
|
} Agg;
|
|
|
|
|
2008-12-28 19:54:01 +01:00
|
|
|
/* ----------------
|
|
|
|
* window aggregate node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct WindowAgg
|
|
|
|
{
|
|
|
|
Plan plan;
|
2008-12-31 01:08:39 +01:00
|
|
|
Index winref; /* ID referenced by window functions */
|
2008-12-28 19:54:01 +01:00
|
|
|
int partNumCols; /* number of columns in partition clause */
|
|
|
|
AttrNumber *partColIdx; /* their indexes in the target list */
|
|
|
|
Oid *partOperators; /* equality operators for partition columns */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *partCollations; /* collations for partition columns */
|
2008-12-28 19:54:01 +01:00
|
|
|
int ordNumCols; /* number of columns in ordering clause */
|
|
|
|
AttrNumber *ordColIdx; /* their indexes in the target list */
|
|
|
|
Oid *ordOperators; /* equality operators for ordering columns */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *ordCollations; /* collations for ordering columns */
|
2008-12-31 01:08:39 +01:00
|
|
|
int frameOptions; /* frame_clause options, see WindowDef */
|
2010-02-12 18:33:21 +01:00
|
|
|
Node *startOffset; /* expression for starting bound, if any */
|
|
|
|
Node *endOffset; /* expression for ending bound, if any */
|
Support all SQL:2011 options for window frame clauses.
This patch adds the ability to use "RANGE offset PRECEDING/FOLLOWING"
frame boundaries in window functions. We'd punted on that back in the
original patch to add window functions, because it was not clear how to
do it in a reasonably data-type-extensible fashion. That problem is
resolved here by adding the ability for btree operator classes to provide
an "in_range" support function that defines how to add or subtract the
RANGE offset value. Factoring it this way also allows the operator class
to avoid overflow problems near the ends of the datatype's range, if it
wishes to expend effort on that. (In the committed patch, the integer
opclasses handle that issue, but it did not seem worth the trouble to
avoid overflow failures for datetime types.)
The patch includes in_range support for the integer_ops opfamily
(int2/int4/int8) as well as the standard datetime types. Support for
other numeric types has been requested, but that seems like suitable
material for a follow-on patch.
In addition, the patch adds GROUPS mode which counts the offset in
ORDER-BY peer groups rather than rows, and it adds the frame_exclusion
options specified by SQL:2011. As far as I can see, we are now fully
up to spec on window framing options.
Existing behaviors remain unchanged, except that I changed the errcode
for a couple of existing error reports to meet the SQL spec's expectation
that negative "offset" values should be reported as SQLSTATE 22013.
Internally and in relevant parts of the documentation, we now consistently
use the terminology "offset PRECEDING/FOLLOWING" rather than "value
PRECEDING/FOLLOWING", since the term "value" is confusingly vague.
Oliver Ford, reviewed and whacked around some by me
Discussion: https://postgr.es/m/CAGMVOdu9sivPAxbNN0X+q19Sfv9edEPv=HibOJhB14TJv_RCQg@mail.gmail.com
2018-02-07 06:06:50 +01:00
|
|
|
/* these fields are used with RANGE offset PRECEDING/FOLLOWING: */
|
|
|
|
Oid startInRangeFunc; /* btree in_range support function for startOffset */
|
|
|
|
Oid endInRangeFunc; /* btree in_range support function for endOffset */
|
|
|
|
Oid inRangeColl; /* collation for in_range tests */
|
|
|
|
bool inRangeAsc; /* use ASC sort order for in_range tests? */
|
|
|
|
bool inRangeNullsFirst; /* nulls sort first for in_range tests? */
|
2008-12-28 19:54:01 +01:00
|
|
|
} WindowAgg;
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/* ----------------
|
|
|
|
* unique node
|
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct Unique
|
|
|
|
{
|
2000-06-19 00:44:35 +02:00
|
|
|
Plan plan;
|
2000-01-27 19:11:50 +01:00
|
|
|
int numCols; /* number of columns to check for uniqueness */
|
2007-01-10 19:06:05 +01:00
|
|
|
AttrNumber *uniqColIdx; /* their indexes in the target list */
|
|
|
|
Oid *uniqOperators; /* equality operators to compare the columns with */
|
2019-03-22 12:09:32 +01:00
|
|
|
Oid *uniqCollations; /* collations for equality comparisons */
|
1996-08-28 03:59:28 +02:00
|
|
|
} Unique;
|
|
|
|
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some design suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
/* ------------
|
|
|
|
* gather node
|
Force rescanning of parallel-aware scan nodes below a Gather[Merge].
The ExecReScan machinery contains various optimizations for postponing
or skipping rescans of plan subtrees; for example a HashAgg node may
conclude that it can re-use the table it built before, instead of
re-reading its input subtree. But that is wrong if the input contains
a parallel-aware table scan node, since the portion of the table scanned
by the leader process is likely to vary from one rescan to the next.
This explains the timing-dependent buildfarm failures we saw after
commit a2b70c89c.
The established mechanism for showing that a plan node's output is
potentially variable is to mark it as depending on some runtime Param.
Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC
parameter number, but carries no actual value) associated with each Gather
or GatherMerge node, mark parallel-aware nodes below that node as dependent
on that Param, and arrange for ExecReScanGather[Merge] to flag that Param
as changed whenever the Gather[Merge] node is rescanned.
This solution breaks an undocumented assumption made by the parallel
executor logic, namely that all rescans of nodes below a Gather[Merge]
will happen synchronously during the ReScan of the top node itself.
But that's fundamentally contrary to the design of the ExecReScan code,
and so was doomed to fail someday anyway (even if you want to argue
that the bug being fixed here wasn't a failure of that assumption).
A follow-on patch will address that issue. In the meantime, the worst
that's expected to happen is that given very bad timing luck, the leader
might have to do all the work during a rescan, because workers think
they have nothing to do, if they are able to start up before the eventual
ReScan of the leader's parallel-aware table scan node has reset the
shared scan state.
Although this problem exists in 9.6, there does not seem to be any way
for it to manifest there. Without GatherMerge, it seems that a plan tree
that has a rescan-short-circuiting node below Gather will always also
have one above it that will short-circuit in the same cases, preventing
the Gather from being rescanned. Hence we won't take the risk of
back-patching this change into 9.6. But v10 needs it.
Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
|
|
|
*
|
|
|
|
* Note: rescan_param is the ID of a PARAM_EXEC parameter slot. That slot
|
|
|
|
* will never actually contain a value, but the Gather node must flag it as
|
|
|
|
* having changed whenever it is rescanned. The child parallel-aware scan
|
|
|
|
* nodes are marked as depending on that parameter, so that the rescan
|
|
|
|
* machinery is aware that their output is likely to change across rescans.
|
|
|
|
* In some cases we don't need a rescan Param, so rescan_param is set to -1.
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some designs suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
* ------------
|
|
|
|
*/
|
|
|
|
typedef struct Gather
|
|
|
|
{
|
|
|
|
Plan plan;
|
Force rescanning of parallel-aware scan nodes below a Gather[Merge].
The ExecReScan machinery contains various optimizations for postponing
or skipping rescans of plan subtrees; for example a HashAgg node may
conclude that it can re-use the table it built before, instead of
re-reading its input subtree. But that is wrong if the input contains
a parallel-aware table scan node, since the portion of the table scanned
by the leader process is likely to vary from one rescan to the next.
This explains the timing-dependent buildfarm failures we saw after
commit a2b70c89c.
The established mechanism for showing that a plan node's output is
potentially variable is to mark it as depending on some runtime Param.
Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC
parameter number, but carries no actual value) associated with each Gather
or GatherMerge node, mark parallel-aware nodes below that node as dependent
on that Param, and arrange for ExecReScanGather[Merge] to flag that Param
as changed whenever the Gather[Merge] node is rescanned.
This solution breaks an undocumented assumption made by the parallel
executor logic, namely that all rescans of nodes below a Gather[Merge]
will happen synchronously during the ReScan of the top node itself.
But that's fundamentally contrary to the design of the ExecReScan code,
and so was doomed to fail someday anyway (even if you want to argue
that the bug being fixed here wasn't a failure of that assumption).
A follow-on patch will address that issue. In the meantime, the worst
that's expected to happen is that given very bad timing luck, the leader
might have to do all the work during a rescan, because workers think
they have nothing to do, if they are able to start up before the eventual
ReScan of the leader's parallel-aware table scan node has reset the
shared scan state.
Although this problem exists in 9.6, there does not seem to be any way
for it to manifest there. Without GatherMerge, it seems that a plan tree
that has a rescan-short-circuiting node below Gather will always also
have one above it that will short-circuit in the same cases, preventing
the Gather from being rescanned. Hence we won't take the risk of
back-patching this change into 9.6. But v10 needs it.
Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
|
|
|
int num_workers; /* planned number of worker processes */
|
|
|
|
int rescan_param; /* ID of Param that signals a rescan, or -1 */
|
|
|
|
bool single_copy; /* don't execute plan more than once */
|
2016-02-07 17:39:22 +01:00
|
|
|
bool invisible; /* suppress EXPLAIN display (for testing)? */
|
2017-11-16 18:06:14 +01:00
|
|
|
Bitmapset *initParam; /* param IDs of initplans which are referred
|
|
|
|
* to at this gather or one of its child nodes */
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some designs suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
} Gather;
|
|
|
|
|
2017-03-09 13:40:36 +01:00
|
|
|
/* ------------
|
|
|
|
* gather merge node
|
|
|
|
* ------------
|
|
|
|
*/
|
|
|
|
typedef struct GatherMerge
|
|
|
|
{
|
|
|
|
Plan plan;
|
Force rescanning of parallel-aware scan nodes below a Gather[Merge].
The ExecReScan machinery contains various optimizations for postponing
or skipping rescans of plan subtrees; for example a HashAgg node may
conclude that it can re-use the table it built before, instead of
re-reading its input subtree. But that is wrong if the input contains
a parallel-aware table scan node, since the portion of the table scanned
by the leader process is likely to vary from one rescan to the next.
This explains the timing-dependent buildfarm failures we saw after
commit a2b70c89c.
The established mechanism for showing that a plan node's output is
potentially variable is to mark it as depending on some runtime Param.
Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC
parameter number, but carries no actual value) associated with each Gather
or GatherMerge node, mark parallel-aware nodes below that node as dependent
on that Param, and arrange for ExecReScanGather[Merge] to flag that Param
as changed whenever the Gather[Merge] node is rescanned.
This solution breaks an undocumented assumption made by the parallel
executor logic, namely that all rescans of nodes below a Gather[Merge]
will happen synchronously during the ReScan of the top node itself.
But that's fundamentally contrary to the design of the ExecReScan code,
and so was doomed to fail someday anyway (even if you want to argue
that the bug being fixed here wasn't a failure of that assumption).
A follow-on patch will address that issue. In the meantime, the worst
that's expected to happen is that given very bad timing luck, the leader
might have to do all the work during a rescan, because workers think
they have nothing to do, if they are able to start up before the eventual
ReScan of the leader's parallel-aware table scan node has reset the
shared scan state.
Although this problem exists in 9.6, there does not seem to be any way
for it to manifest there. Without GatherMerge, it seems that a plan tree
that has a rescan-short-circuiting node below Gather will always also
have one above it that will short-circuit in the same cases, preventing
the Gather from being rescanned. Hence we won't take the risk of
back-patching this change into 9.6. But v10 needs it.
Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
|
|
|
int num_workers; /* planned number of worker processes */
|
|
|
|
int rescan_param; /* ID of Param that signals a rescan, or -1 */
|
2017-03-09 13:40:36 +01:00
|
|
|
/* remaining fields are just like the sort-key info in struct Sort */
|
|
|
|
int numCols; /* number of sort-key columns */
|
|
|
|
AttrNumber *sortColIdx; /* their indexes in the target list */
|
|
|
|
Oid *sortOperators; /* OIDs of operators to sort them by */
|
|
|
|
Oid *collations; /* OIDs of collations */
|
|
|
|
bool *nullsFirst; /* NULLS FIRST/LAST directions */
|
2017-11-16 18:06:14 +01:00
|
|
|
Bitmapset *initParam; /* param IDs of initplans which are referred
|
|
|
|
* to at this gather merge or one of its child nodes */
|
2017-03-09 13:40:36 +01:00
|
|
|
} GatherMerge;
|
|
|
|
|
2002-12-05 16:50:39 +01:00
|
|
|
/* ----------------
|
|
|
|
* hash build node
|
2009-03-21 01:04:40 +01:00
|
|
|
*
|
|
|
|
* If the executor is supposed to try to apply skew join optimization, then
|
2009-12-29 21:11:45 +01:00
|
|
|
* skewTable/skewColumn/skewInherit identify the outer relation's join key
|
2017-05-14 17:07:40 +02:00
|
|
|
* column, from which the relevant MCV statistics can be fetched.
|
2002-12-05 16:50:39 +01:00
|
|
|
* ----------------
|
|
|
|
*/
|
|
|
|
typedef struct Hash
|
|
|
|
{
|
|
|
|
Plan plan;
|
Fix representation of hash keys in Hash/HashJoin nodes.
In 5f32b29c1819 I changed the creation of HashState.hashkeys to
actually use HashState as the parent (instead of HashJoinState, which
was incorrect, as they were executed below HashState), to fix the
problem of hashkeys expressions otherwise relying on slot types
appropriate for HashJoinState, rather than HashState as would be
correct. That reliance was only introduced in 12, which is why it
previously worked to use HashJoinState as the parent (although I'd be
unsurprised if there were problematic cases).
Unfortunately that's not a sufficient solution, because before this
commit, the to-be-hashed expressions referenced inner/outer as
appropriate for the HashJoin, not Hash. That didn't have obvious bad
consequences, because the slots containing the tuples were put into
ecxt_innertuple when hashing a tuple for HashState (even though Hash
doesn't have an inner plan).
There are less common cases where this can cause visible problems
however (rather than just confusion when inspecting such executor
trees). E.g. "ERROR: bogus varno: 65000", when explaining queries
containing a HashJoin where the subsidiary Hash node's hash keys
reference a subplan. While normally hashkeys aren't displayed by
EXPLAIN, if one of those expressions references a subplan, that
subplan may be printed as part of the Hash node - which then failed
because an inner plan was referenced, and Hash doesn't have that.
It seems quite possible that there's other broken cases, too.
Fix the problem by properly splitting the expression for the HashJoin
and Hash nodes at plan time, and have them reference the proper
subsidiary node. While other workarounds are possible, fixing this
correctly seems easy enough. It was a pretty ugly hack to have
ExecInitHashJoin put the expression into the already initialized
HashState, in the first place.
I decided to not just split inner/outer hashkeys inside
make_hashjoin(), but also to separate out hashoperators and
hashcollations at plan time. Otherwise we would have ended up having
two very similar loops, one at plan time and the other during executor
startup. The work seems to more appropriately belong to plan time,
anyway.
Reported-By: Nikita Glukhov, Alexander Korotkov
Author: Andres Freund
Reviewed-By: Tom Lane, in an earlier version
Discussion: https://postgr.es/m/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR+teQ_8tEXU8mxg@mail.gmail.com
Backpatch: 12-
2019-08-02 09:02:46 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* List of expressions to be hashed for tuples from Hash's outer plan,
|
|
|
|
* needed to put them into the hashtable.
|
|
|
|
*/
|
|
|
|
List *hashkeys; /* hash keys for the hashjoin condition */
|
2009-03-21 01:04:40 +01:00
|
|
|
Oid skewTable; /* outer join key's table OID, or InvalidOid */
|
|
|
|
AttrNumber skewColumn; /* outer join key's column #, or zero */
|
2009-12-29 21:11:45 +01:00
|
|
|
bool skewInherit; /* is outer join rel an inheritance tree? */
|
2003-11-25 22:00:54 +01:00
|
|
|
/* all other info is in the parent HashJoin node */
|
2021-09-15 18:56:13 +02:00
|
|
|
Cardinality rows_total; /* estimated total rows if parallel_aware */
|
2002-12-05 16:50:39 +01:00
|
|
|
} Hash;
|
|
|
|
|
2000-10-05 21:11:39 +02:00
|
|
|
/* ----------------
 *		setop node
 *
 * Performs a set operation (cmd/strategy, see nodes.h) over the tuples of
 * its single subplan.  Input tuples carry a "flag" column (flagColIdx)
 * identifying which input relation each row came from; firstFlag is the
 * flag value belonging to the first input.
 * ----------------
 */
typedef struct SetOp
{
	Plan		plan;
	SetOpCmd	cmd;			/* what to do, see nodes.h */
	SetOpStrategy strategy;		/* how to do it, see nodes.h */
	int			numCols;		/* number of columns to check for
								 * duplicate-ness */
	AttrNumber *dupColIdx;		/* their indexes in the target list */
	Oid		   *dupOperators;	/* equality operators to compare with */
	Oid		   *dupCollations;	/* collations for equality comparisons */
	AttrNumber	flagColIdx;		/* where is the flag column, if any */
	int			firstFlag;		/* flag value for first input relation */
	long		numGroups;		/* estimated number of groups in input */
} SetOp;
|
|
|
|
|
2009-10-12 20:10:51 +02:00
|
|
|
/* ----------------
 *		lock-rows node
 *
 * Locks the rows identified by rowMarks as they pass through this node.
 * rowMarks identifies the rels to be locked by this node; it should be
 * a subset of the rowMarks listed in the top-level PlannedStmt.
 *
 * epqParam is a Param that all scan nodes below this one must depend on.
 * It is used to force re-evaluation of the plan during EvalPlanQual;
 * signaling a change in that Param makes the EPQ machinery rescan the
 * already-built test plan instead of rebuilding it per row.
 * ----------------
 */
typedef struct LockRows
{
	Plan		plan;
	List	   *rowMarks;		/* a list of PlanRowMark's */
	int			epqParam;		/* ID of Param for EvalPlanQual re-eval */
} LockRows;
|
|
|
|
|
2000-10-26 23:38:24 +02:00
|
|
|
/* ----------------
 *		limit node
 *
 * Implements OFFSET/LIMIT on the output of its subplan.
 *
 * Note: as of Postgres 8.2, the offset and count expressions are expected
 * to yield int8, rather than int4 as before.
 *
 * NOTE(review): the uniq* fields appear to identify the columns compared
 * for "similarity" when limitOption requires grouping equal rows (e.g.
 * FETCH FIRST ... WITH TIES) — confirm against nodes.h's LimitOption.
 * ----------------
 */
typedef struct Limit
{
	Plan		plan;
	Node	   *limitOffset;	/* OFFSET parameter, or NULL if none */
	Node	   *limitCount;		/* COUNT parameter, or NULL if none */
	LimitOption limitOption;	/* limit type */
	int			uniqNumCols;	/* number of columns to check for similarity */
	AttrNumber *uniqColIdx;		/* their indexes in the target list */
	Oid		   *uniqOperators;	/* equality operators to compare with */
	Oid		   *uniqCollations; /* collations for equality comparisons */
} Limit;
|
|
|
|
|
2008-09-09 20:58:09 +02:00
|
|
|
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
/*
 * RowMarkType -
 *	  enums for types of row-marking operations
 *
 * The first four of these values represent different lock strengths that
 * we can take on tuples according to SELECT FOR [KEY] UPDATE/SHARE requests.
 * We support these on regular tables, as well as on foreign tables whose FDWs
 * report support for late locking.  For other foreign tables, any locking
 * that might be done for such requests must happen during the initial row
 * fetch; their FDWs provide no mechanism for going back to lock a row later.
 * This means that the semantics will be a bit different than for a local
 * table; in particular we are likely to lock more rows than would be locked
 * locally, since remote rows will be locked even if they then fail
 * locally-checked restriction or join quals.  However, the prospect of
 * doing a separate remote query to lock each selected row is usually pretty
 * unappealing, so early locking remains a credible design choice for FDWs.
 *
 * When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we have to uniquely
 * identify all the source rows, not only those from the target relations, so
 * that we can perform EvalPlanQual rechecking at need.  For plain tables we
 * can just fetch the TID, much as for a target relation; this case is
 * represented by ROW_MARK_REFERENCE.  Otherwise (for example for VALUES or
 * FUNCTION scans) we have to copy the whole row value.  ROW_MARK_COPY is
 * pretty inefficient, since most of the time we'll never need the data; but
 * fortunately the overhead is usually not performance-critical in practice.
 * By default we use ROW_MARK_COPY for foreign tables, but if the FDW has
 * a concept of rowid it can request to use ROW_MARK_REFERENCE instead.
 * (Again, this probably doesn't make sense if a physical remote fetch is
 * needed, but for FDWs that map to local storage it might be credible.)
 *
 * The locking marks are deliberately listed first (see
 * RowMarkRequiresRowShareLock); do not reorder these values.
 */
typedef enum RowMarkType
{
	ROW_MARK_EXCLUSIVE,			/* obtain exclusive tuple lock */
	ROW_MARK_NOKEYEXCLUSIVE,	/* obtain no-key exclusive tuple lock */
	ROW_MARK_SHARE,				/* obtain shared tuple lock */
	ROW_MARK_KEYSHARE,			/* obtain keyshare tuple lock */
	ROW_MARK_REFERENCE,			/* just fetch the TID, don't lock it */
	ROW_MARK_COPY				/* physically copy the row value */
} RowMarkType;
|
|
|
|
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
/*
 * RowMarkRequiresRowShareLock -
 *		Does this RowMarkType require taking ROW SHARE lock on the table?
 *		(Presumably the locking mark types are ordered at or below
 *		ROW_MARK_KEYSHARE in the RowMarkType enum; the enum definition is
 *		outside this chunk — confirm there.)
 */
#define RowMarkRequiresRowShareLock(marktype) ((marktype) <= ROW_MARK_KEYSHARE)
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PlanRowMark -
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This causes additional WAL-logging, also (there was previously a single
WAL record for a locked tuple; now there are as many as updated copies
of the tuple there exist.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
|
|
|
* plan-time representation of FOR [KEY] UPDATE/SHARE clauses
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
*
|
2013-03-10 19:14:53 +01:00
|
|
|
* When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we create a separate
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
* PlanRowMark node for each non-target relation in the query. Relations that
|
2013-03-10 19:14:53 +01:00
|
|
|
* are not specified as FOR UPDATE/SHARE are marked ROW_MARK_REFERENCE (if
|
Add support for doing late row locking in FDWs.
Previously, FDWs could only do "early row locking", that is lock a row as
soon as it's fetched, even though local restriction/join conditions might
discard the row later. This patch adds callbacks that allow FDWs to do
late locking in the same way that it's done for regular tables.
To make use of this feature, an FDW must support the "ctid" column as a
unique row identifier. Currently, since ctid has to be of type TID,
the feature is of limited use, though in principle it could be used by
postgres_fdw. We may eventually allow FDWs to specify another data type
for ctid, which would make it possible for more FDWs to use this feature.
This commit does not modify postgres_fdw to use late locking. We've
tested some prototype code for that, but it's not in committable shape,
and besides it's quite unclear whether it actually makes sense to do late
locking against a remote server. The extra round trips required are likely
to outweigh any benefit from improved concurrency.
Etsuro Fujita, reviewed by Ashutosh Bapat, and hacked up a lot by me
2015-05-12 20:10:10 +02:00
|
|
|
* regular tables or supported foreign tables) or ROW_MARK_COPY (if not).
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
*
|
|
|
|
* Initially all PlanRowMarks have rti == prti and isParent == false.
|
|
|
|
* When the planner discovers that a relation is the root of an inheritance
|
|
|
|
* tree, it sets isParent true, and adds an additional PlanRowMark to the
|
|
|
|
* list for each child relation (including the target rel itself in its role
|
2021-06-02 17:52:35 +02:00
|
|
|
* as a child, if it is not a partitioned table). Any non-leaf partitioned
|
|
|
|
* child relations will also have entries with isParent = true. The child
|
|
|
|
* entries have rti == child rel's RT index and prti == top parent's RT index,
|
2017-03-21 14:48:04 +01:00
|
|
|
* and can therefore be recognized as children by the fact that prti != rti.
|
|
|
|
* The parent's allMarkTypes field gets the OR of (1<<markType) across all
|
|
|
|
* its children (this definition allows children to use different markTypes).
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
*
|
2011-01-13 02:47:02 +01:00
|
|
|
* The planner also adds resjunk output columns to the plan that carry
|
Add support for doing late row locking in FDWs.
Previously, FDWs could only do "early row locking", that is lock a row as
soon as it's fetched, even though local restriction/join conditions might
discard the row later. This patch adds callbacks that allow FDWs to do
late locking in the same way that it's done for regular tables.
To make use of this feature, an FDW must support the "ctid" column as a
unique row identifier. Currently, since ctid has to be of type TID,
the feature is of limited use, though in principle it could be used by
postgres_fdw. We may eventually allow FDWs to specify another data type
for ctid, which would make it possible for more FDWs to use this feature.
This commit does not modify postgres_fdw to use late locking. We've
tested some prototype code for that, but it's not in committable shape,
and besides it's quite unclear whether it actually makes sense to do late
locking against a remote server. The extra round trips required are likely
to outweigh any benefit from improved concurrency.
Etsuro Fujita, reviewed by Ashutosh Bapat, and hacked up a lot by me
2015-05-12 20:10:10 +02:00
|
|
|
* information sufficient to identify the locked or fetched rows. When
|
|
|
|
* markType != ROW_MARK_COPY, these columns are named
|
2011-01-13 02:47:02 +01:00
|
|
|
* tableoid%u OID of table
|
|
|
|
* ctid%u TID of row
|
|
|
|
* The tableoid column is only present for an inheritance hierarchy.
|
|
|
|
* When markType == ROW_MARK_COPY, there is instead a single column named
|
|
|
|
* wholerow%u whole-row value of relation
|
Improve representation of PlanRowMark.
This patch fixes two inadequacies of the PlanRowMark representation.
First, that the original LockingClauseStrength isn't stored (and cannot be
inferred for foreign tables, which always get ROW_MARK_COPY). Since some
PlanRowMarks are created out of whole cloth and don't actually have an
ancestral RowMarkClause, this requires adding a dummy LCS_NONE value to
enum LockingClauseStrength, which is fairly annoying but the alternatives
seem worse. This fix allows getting rid of the use of get_parse_rowmark()
in FDWs (as per the discussion around commits 462bd95705a0c23b and
8ec8760fc87ecde0), and it simplifies some things elsewhere.
Second, that the representation assumed that all child tables in an
inheritance hierarchy would use the same RowMarkType. That's true today
but will soon not be true. We add an "allMarkTypes" field that identifies
the union of mark types used in all a parent table's children, and use
that where appropriate (currently, only in preprocess_targetlist()).
In passing fix a couple of minor infelicities left over from the SKIP
LOCKED patch, notably that _outPlanRowMark still thought waitPolicy
is a bool.
Catversion bump is required because the numeric values of enum
LockingClauseStrength can appear in on-disk rules.
Extracted from a much larger patch to support foreign table inheritance;
it seemed worth breaking this out, since it's a separable concern.
Shigeru Hanada and Etsuro Fujita, somewhat modified by me
2015-03-15 23:41:47 +01:00
|
|
|
* (An inheritance hierarchy could have all three resjunk output columns,
|
|
|
|
* if some children use a different markType than others.)
|
2011-02-10 05:27:07 +01:00
|
|
|
* In all three cases, %u represents the rowmark ID number (rowmarkId).
|
|
|
|
* This number is unique within a plan tree, except that child relation
|
|
|
|
* entries copy their parent's rowmarkId. (Assigning unique numbers
|
|
|
|
* means we needn't renumber rowmarkIds when flattening subqueries, which
|
|
|
|
* would require finding and renaming the resjunk columns as well.)
|
2011-01-13 02:47:02 +01:00
|
|
|
* Note this means that all tables in an inheritance hierarchy share the
|
2021-06-02 17:52:35 +02:00
|
|
|
* same resjunk column names.
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
*/
|
|
|
|
typedef struct PlanRowMark
|
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
Index rti; /* range table index of markable relation */
|
|
|
|
Index prti; /* range table index of parent relation */
|
2011-02-10 05:27:07 +01:00
|
|
|
Index rowmarkId; /* unique identifier for resjunk columns */
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
RowMarkType markType; /* see enum above */
|
Improve representation of PlanRowMark.
This patch fixes two inadequacies of the PlanRowMark representation.
First, that the original LockingClauseStrength isn't stored (and cannot be
inferred for foreign tables, which always get ROW_MARK_COPY). Since some
PlanRowMarks are created out of whole cloth and don't actually have an
ancestral RowMarkClause, this requires adding a dummy LCS_NONE value to
enum LockingClauseStrength, which is fairly annoying but the alternatives
seem worse. This fix allows getting rid of the use of get_parse_rowmark()
in FDWs (as per the discussion around commits 462bd95705a0c23b and
8ec8760fc87ecde0), and it simplifies some things elsewhere.
Second, that the representation assumed that all child tables in an
inheritance hierarchy would use the same RowMarkType. That's true today
but will soon not be true. We add an "allMarkTypes" field that identifies
the union of mark types used in all a parent table's children, and use
that where appropriate (currently, only in preprocess_targetlist()).
In passing fix a couple of minor infelicities left over from the SKIP
LOCKED patch, notably that _outPlanRowMark still thought waitPolicy
is a bool.
Catversion bump is required because the numeric values of enum
LockingClauseStrength can appear in on-disk rules.
Extracted from a much larger patch to support foreign table inheritance;
it seemed worth breaking this out, since it's a separable concern.
Shigeru Hanada and Etsuro Fujita, somewhat modified by me
2015-03-15 23:41:47 +01:00
|
|
|
int allMarkTypes; /* OR of (1<<markType) for all children */
|
|
|
|
LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */
|
2014-11-21 00:36:07 +01:00
|
|
|
LockWaitPolicy waitPolicy; /* NOWAIT and SKIP LOCKED options */
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
bool isParent; /* true if this is a "dummy" parent entry */
|
|
|
|
} PlanRowMark;
|
|
|
|
|
|
|
|
|
2018-06-10 22:30:14 +02:00
|
|
|
/*
|
|
|
|
* Node types to represent partition pruning information.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PartitionPruneInfo - Details required to allow the executor to prune
|
|
|
|
* partitions.
|
|
|
|
*
|
|
|
|
* Here we store mapping details to allow translation of a partitioned table's
|
2018-06-11 00:24:34 +02:00
|
|
|
* index as returned by the partition pruning code into subplan indexes for
|
|
|
|
* plan types which support arbitrary numbers of subplans, such as Append.
|
|
|
|
* We also store various details to tell the executor when it should be
|
|
|
|
* performing partition pruning.
|
2018-06-11 23:14:46 +02:00
|
|
|
*
|
2018-08-02 01:42:46 +02:00
|
|
|
* Each PartitionedRelPruneInfo describes the partitioning rules for a single
|
|
|
|
* partitioned table (a/k/a level of partitioning). Since a partitioning
|
|
|
|
* hierarchy could contain multiple levels, we represent it by a List of
|
|
|
|
* PartitionedRelPruneInfos, where the first entry represents the topmost
|
|
|
|
* partitioned table and additional entries represent non-leaf child
|
|
|
|
* partitions, ordered such that parents appear before their children.
|
|
|
|
* Then, since an Append-type node could have multiple partitioning
|
|
|
|
* hierarchies among its children, we have an unordered List of those Lists.
|
|
|
|
*
|
|
|
|
* prune_infos List of Lists containing PartitionedRelPruneInfo nodes,
|
|
|
|
* one sublist per run-time-prunable partition hierarchy
|
|
|
|
* appearing in the parent plan node's subplans.
|
|
|
|
* other_subplans Indexes of any subplans that are not accounted for
|
|
|
|
* by any of the PartitionedRelPruneInfo nodes in
|
|
|
|
* "prune_infos". These subplans must not be pruned.
|
|
|
|
*/
|
|
|
|
typedef struct PartitionPruneInfo
|
|
|
|
{
|
|
|
|
NodeTag type;
|
|
|
|
List *prune_infos;
|
|
|
|
Bitmapset *other_subplans;
|
|
|
|
} PartitionPruneInfo;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PartitionedRelPruneInfo - Details required to allow the executor to prune
|
|
|
|
* partitions for a single partitioned table.
|
2018-06-11 23:14:46 +02:00
|
|
|
*
|
2018-11-15 23:04:48 +01:00
|
|
|
* subplan_map[] and subpart_map[] are indexed by partition index of the
|
|
|
|
* partitioned table referenced by 'rtindex', the partition index being the
|
|
|
|
* order that the partitions are defined in the table's PartitionDesc. For a
|
|
|
|
* leaf partition p, subplan_map[p] contains the zero-based index of the
|
|
|
|
* partition's subplan in the parent plan's subplan list; it is -1 if the
|
|
|
|
* partition is non-leaf or has been pruned. For a non-leaf partition p,
|
|
|
|
* subpart_map[p] contains the zero-based index of that sub-partition's
|
|
|
|
* PartitionedRelPruneInfo in the hierarchy's PartitionedRelPruneInfo list;
|
|
|
|
* it is -1 if the partition is a leaf or has been pruned. Note that subplan
|
|
|
|
* indexes, as stored in 'subplan_map', are global across the parent plan
|
2018-08-02 01:42:46 +02:00
|
|
|
* node, but partition indexes are valid only within a particular hierarchy.
|
2019-03-30 23:58:55 +01:00
|
|
|
* relid_map[p] contains the partition's OID, or 0 if the partition was pruned.
|
2018-06-10 22:30:14 +02:00
|
|
|
*/
|
2018-08-02 01:42:46 +02:00
|
|
|
typedef struct PartitionedRelPruneInfo
|
2018-06-10 22:30:14 +02:00
|
|
|
{
|
|
|
|
NodeTag type;
|
2018-10-04 20:03:37 +02:00
|
|
|
Index rtindex; /* RT index of partition rel for this level */
|
2018-06-11 23:14:46 +02:00
|
|
|
Bitmapset *present_parts; /* Indexes of all partitions which subplans or
|
Restructure creation of run-time pruning steps.
Previously, gen_partprune_steps() always built executor pruning steps
using all suitable clauses, including those containing PARAM_EXEC
Params. This meant that the pruning steps were only completely safe
for executor run-time (scan start) pruning. To prune at executor
startup, we had to ignore the steps involving exec Params. But this
doesn't really work in general, since there may be logic changes
needed as well --- for example, pruning according to the last operator's
btree strategy is the wrong thing if we're not applying that operator.
The rules embodied in gen_partprune_steps() and its minions are
sufficiently complicated that tracking their incremental effects in
other logic seems quite impractical.
Short of a complete redesign, the only safe fix seems to be to run
gen_partprune_steps() twice, once to create executor startup pruning
steps and then again for run-time pruning steps. We can save a few
cycles however by noting during the first scan whether we rejected
any clauses because they involved exec Params --- if not, we don't
need to do the second scan.
In support of this, refactor the internal APIs in partprune.c to make
more use of passing information in the GeneratePruningStepsContext
struct, rather than as separate arguments.
This is, I hope, the last piece of our response to a bug report from
Alan Jackson. Back-patch to v11 where this code came in.
Discussion: https://postgr.es/m/FAD28A83-AC73-489E-A058-2681FA31D648@tvsquared.com
2019-05-18 01:44:19 +02:00
|
|
|
* subparts are present for */
|
|
|
|
int nparts; /* Length of the following arrays: */
|
2018-06-11 23:14:46 +02:00
|
|
|
int *subplan_map; /* subplan index by partition index, or -1 */
|
|
|
|
int *subpart_map; /* subpart index by partition index, or -1 */
|
2019-03-30 23:58:55 +01:00
|
|
|
Oid *relid_map; /* relation OID by partition index, or 0 */
|
Restructure creation of run-time pruning steps.
Previously, gen_partprune_steps() always built executor pruning steps
using all suitable clauses, including those containing PARAM_EXEC
Params. This meant that the pruning steps were only completely safe
for executor run-time (scan start) pruning. To prune at executor
startup, we had to ignore the steps involving exec Params. But this
doesn't really work in general, since there may be logic changes
needed as well --- for example, pruning according to the last operator's
btree strategy is the wrong thing if we're not applying that operator.
The rules embodied in gen_partprune_steps() and its minions are
sufficiently complicated that tracking their incremental effects in
other logic seems quite impractical.
Short of a complete redesign, the only safe fix seems to be to run
gen_partprune_steps() twice, once to create executor startup pruning
steps and then again for run-time pruning steps. We can save a few
cycles however by noting during the first scan whether we rejected
any clauses because they involved exec Params --- if not, we don't
need to do the second scan.
In support of this, refactor the internal APIs in partprune.c to make
more use of passing information in the GeneratePruningStepsContext
struct, rather than as separate arguments.
This is, I hope, the last piece of our response to a bug report from
Alan Jackson. Back-patch to v11 where this code came in.
Discussion: https://postgr.es/m/FAD28A83-AC73-489E-A058-2681FA31D648@tvsquared.com
2019-05-18 01:44:19 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* initial_pruning_steps shows how to prune during executor startup (i.e.,
|
|
|
|
* without use of any PARAM_EXEC Params); it is NIL if no startup pruning
|
|
|
|
* is required. exec_pruning_steps shows how to prune with PARAM_EXEC
|
|
|
|
* Params; it is NIL if no per-scan pruning is required.
|
|
|
|
*/
|
|
|
|
List *initial_pruning_steps; /* List of PartitionPruneStep */
|
|
|
|
List *exec_pruning_steps; /* List of PartitionPruneStep */
|
|
|
|
Bitmapset *execparamids; /* All PARAM_EXEC Param IDs in
|
|
|
|
* exec_pruning_steps */
|
2018-08-02 01:42:46 +02:00
|
|
|
} PartitionedRelPruneInfo;
|
2018-06-10 22:30:14 +02:00
|
|
|
|
|
|
|
/*
 * Abstract Node type for partition pruning steps (there are no concrete
 * Nodes of this type).
 *
 * step_id is the global identifier of the step within its pruning context.
 * Concrete steps embed this struct as their first field; see
 * PartitionPruneStepOp and PartitionPruneStepCombine.
 */
typedef struct PartitionPruneStep
{
	NodeTag		type;			/* tag identifying the concrete step type */
	int			step_id;		/* unique ID within the pruning context */
} PartitionPruneStep;
|
|
|
|
|
|
|
|
/*
 * PartitionPruneStepOp - Information to prune using a set of mutually ANDed
 *							OpExpr clauses
 *
 * This contains information extracted from up to partnatts OpExpr clauses,
 * where partnatts is the number of partition key columns.  'opstrategy' is the
 * strategy of the operator in the clause matched to the last partition key.
 * 'exprs' contains expressions which comprise the lookup key to be passed to
 * the partition bound search function.  'cmpfns' contains the OIDs of
 * comparison functions used to compare aforementioned expressions with
 * partition bounds.  Both 'exprs' and 'cmpfns' contain the same number of
 * items, up to partnatts items.
 *
 * Once we find the offset of a partition bound using the lookup key, we
 * determine which partitions to include in the result based on the value of
 * 'opstrategy'.  For example, if it were equality, we'd return just the
 * partition that would contain that key or a set of partitions if the key
 * didn't consist of all partitioning columns.  For non-equality strategies,
 * we'd need to include other partitions as appropriate.
 *
 * 'nullkeys' is the set containing the offset of the partition keys (0 to
 * partnatts - 1) that were matched to an IS NULL clause.  This is only
 * considered for hash partitioning as we need to pass which keys are null
 * to the hash partition bound search function.  It is never possible to
 * have an expression be present in 'exprs' for a given partition key and
 * the corresponding bit set in 'nullkeys'.
 */
typedef struct PartitionPruneStepOp
{
	PartitionPruneStep step;	/* common step fields; must be first */

	StrategyNumber opstrategy;	/* btree/hash strategy of the last-key op */
	List	   *exprs;			/* lookup-key expressions, one per matched key */
	List	   *cmpfns;			/* comparison function OIDs, parallel to exprs */
	Bitmapset  *nullkeys;		/* key offsets matched to IS NULL clauses */
} PartitionPruneStepOp;
|
|
|
|
|
|
|
|
/*
 * PartitionPruneStepCombine - Information to prune using a BoolExpr clause
 *
 * For BoolExpr clauses, we combine the set of partitions determined for each
 * of the argument clauses.
 */
typedef enum PartitionPruneCombineOp
{
	PARTPRUNE_COMBINE_UNION,	/* OR semantics: union the source sets */
	PARTPRUNE_COMBINE_INTERSECT /* AND semantics: intersect the source sets */
} PartitionPruneCombineOp;
|
|
|
|
|
|
|
|
/*
 * PartitionPruneStepCombine - combines the partition sets produced by other
 * pruning steps, per 'combineOp'.  'source_stepids' lists the step_ids of
 * the steps whose results are to be combined.
 */
typedef struct PartitionPruneStepCombine
{
	PartitionPruneStep step;	/* common step fields; must be first */

	PartitionPruneCombineOp combineOp;	/* how to combine the source results */
	List	   *source_stepids; /* step_ids of the steps being combined */
} PartitionPruneStepCombine;
|
|
|
|
|
|
|
|
|
2008-09-09 20:58:09 +02:00
|
|
|
/*
 * Plan invalidation info
 *
 * We track the objects on which a PlannedStmt depends in two ways:
 * relations are recorded as a simple list of OIDs, and everything else
 * is represented as a list of PlanInvalItems.  A PlanInvalItem is designed
 * to be used with the syscache invalidation mechanism, so it identifies a
 * system catalog entry by cache ID and hash value.
 */
typedef struct PlanInvalItem
{
	NodeTag		type;
	int			cacheId;		/* a syscache ID, see utils/syscache.h */
	uint32		hashValue;		/* hash value of object's cache lookup key */
} PlanInvalItem;
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
#endif /* PLANNODES_H */
|