postgresql/src/include/nodes/plannodes.h

898 lines
30 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* plannodes.h
* definitions for query plan nodes
*
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
2010-09-20 22:08:53 +02:00
* src/include/nodes/plannodes.h
*
*-------------------------------------------------------------------------
*/
#ifndef PLANNODES_H
#define PLANNODES_H
#include "access/sdir.h"
#include "lib/stringinfo.h"
#include "nodes/bitmapset.h"
#include "nodes/lockoptions.h"
#include "nodes/primnodes.h"
/* ----------------------------------------------------------------
* node definitions
* ----------------------------------------------------------------
*/
/* ----------------
* PlannedStmt node
*
* The output of the planner is a Plan tree headed by a PlannedStmt node.
* PlannedStmt holds the "one time" information needed by the executor.
* ----------------
*/
typedef struct PlannedStmt
{
NodeTag type;
CmdType commandType; /* select|insert|update|delete */
uint32 queryId; /* query identifier (copied from Query) */
bool hasReturning; /* is it insert|update|delete RETURNING? */
bool hasModifyingCTE; /* has insert|update|delete in WITH? */
bool canSetTag; /* do I set the command result tag? */
bool transientPlan; /* redo plan when TransactionXmin changes? */
struct Plan *planTree; /* tree of Plan nodes */
List *rtable; /* list of RangeTblEntry nodes */
/* rtable indexes of target relations for INSERT/UPDATE/DELETE */
List *resultRelations; /* integer list of RT indexes, or NIL */
Node *utilityStmt; /* non-null if this is DECLARE CURSOR */
List *subplans; /* Plan trees for SubPlan expressions */
Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */
List *rowMarks; /* a list of PlanRowMark's */
List *relationOids; /* OIDs of relations the plan depends on */
List *invalItems; /* other dependencies, as PlanInvalItems */
int nParamExec; /* number of PARAM_EXEC Params used */
Row-Level Security Policies (RLS) Building on the updatable security-barrier views work, add the ability to define policies on tables to limit the set of rows which are returned from a query and which are allowed to be added to a table. Expressions defined by the policy for filtering are added to the security barrier quals of the query, while expressions defined to check records being added to a table are added to the with-check options of the query. New top-level commands are CREATE/ALTER/DROP POLICY and are controlled by the table owner. Row Security is able to be enabled and disabled by the owner on a per-table basis using ALTER TABLE .. ENABLE/DISABLE ROW SECURITY. Per discussion, ROW SECURITY is disabled on tables by default and must be enabled for policies on the table to be used. If no policies exist on a table with ROW SECURITY enabled, a default-deny policy is used and no records will be visible. By default, row security is applied at all times except for the table owner and the superuser. A new GUC, row_security, is added which can be set to ON, OFF, or FORCE. When set to FORCE, row security will be applied even for the table owner and superusers. When set to OFF, row security will be disabled when allowed and an error will be thrown if the user does not have rights to bypass row security. Per discussion, pg_dump sets row_security = OFF by default to ensure that exports and backups will have all data in the table or will error if there are insufficient privileges to bypass row security. A new option has been added to pg_dump, --enable-row-security, to ask pg_dump to export with row security enabled. A new role capability, BYPASSRLS, which can only be set by the superuser, is added to allow other users to be able to bypass row security using row_security = OFF. Many thanks to the various individuals who have helped with the design, particularly Robert Haas for his feedback. Authors include Craig Ringer, KaiGai Kohei, Adam Brightwell, Dean Rasheed, with additional changes and rework by me. Reviewers have included all of the above, Greg Smith, Jeff McCormick, and Robert Haas.
2014-09-19 17:18:35 +02:00
bool hasRowSecurity; /* row security applied? */
Row-Level Security Policies (RLS) Building on the updatable security-barrier views work, add the ability to define policies on tables to limit the set of rows which are returned from a query and which are allowed to be added to a table. Expressions defined by the policy for filtering are added to the security barrier quals of the query, while expressions defined to check records being added to a table are added to the with-check options of the query. New top-level commands are CREATE/ALTER/DROP POLICY and are controlled by the table owner. Row Security is able to be enabled and disabled by the owner on a per-table basis using ALTER TABLE .. ENABLE/DISABLE ROW SECURITY. Per discussion, ROW SECURITY is disabled on tables by default and must be enabled for policies on the table to be used. If no policies exist on a table with ROW SECURITY enabled, a default-deny policy is used and no records will be visible. By default, row security is applied at all times except for the table owner and the superuser. A new GUC, row_security, is added which can be set to ON, OFF, or FORCE. When set to FORCE, row security will be applied even for the table owner and superusers. When set to OFF, row security will be disabled when allowed and an error will be thrown if the user does not have rights to bypass row security. Per discussion, pg_dump sets row_security = OFF by default to ensure that exports and backups will have all data in the table or will error if there are insufficient privileges to bypass row security. A new option has been added to pg_dump, --enable-row-security, to ask pg_dump to export with row security enabled. A new role capability, BYPASSRLS, which can only be set by the superuser, is added to allow other users to be able to bypass row security using row_security = OFF. Many thanks to the various individuals who have helped with the design, particularly Robert Haas for his feedback. Authors include Craig Ringer, KaiGai Kohei, Adam Brightwell, Dean Rasheed, with additional changes and rework by me. Reviewers have included all of the above, Greg Smith, Jeff McCormick, and Robert Haas.
2014-09-19 17:18:35 +02:00
} PlannedStmt;
/* macro for fetching the Plan associated with a SubPlan node */
#define exec_subplan_get_plan(plannedstmt, subplan) \
((Plan *) list_nth((plannedstmt)->subplans, (subplan)->plan_id - 1))
/* ----------------
* Plan node
*
* All plan nodes "derive" from the Plan structure by having the
* Plan structure as the first field. This ensures that everything works
* when nodes are cast to Plan's. (node pointers are frequently cast to Plan*
* when passed around generically in the executor)
*
* We never actually instantiate any Plan nodes; this is just the common
* abstract superclass for all Plan-type nodes.
* ----------------
*/
typedef struct Plan
{
NodeTag type;
/*
* estimated execution costs for plan (see costsize.c for more info)
*/
2005-10-15 04:49:52 +02:00
Cost startup_cost; /* cost expended before fetching any tuples */
Cost total_cost; /* total cost (assuming all tuples fetched) */
/*
* planner's estimate of result size of this plan step
*/
double plan_rows; /* number of rows plan is expected to emit */
int plan_width; /* average row width in bytes */
/*
* Common structural data for all Plan types.
*/
List *targetlist; /* target list to be computed at this node */
List *qual; /* implicitly-ANDed qual conditions */
struct Plan *lefttree; /* input plan tree(s) */
struct Plan *righttree;
List *initPlan; /* Init Plan nodes (un-correlated expr
* subselects) */
/*
* Information for management of parameter-change-driven rescanning
*
* extParam includes the paramIDs of all external PARAM_EXEC params
* affecting this plan node or its children. setParam params from the
* node's initPlans are not included, but their extParams are.
*
* allParam includes all the extParam paramIDs, plus the IDs of local
* params that affect the node (i.e., the setParams of its initplans).
* These are _all_ the PARAM_EXEC params that affect this node.
*/
Bitmapset *extParam;
Bitmapset *allParam;
} Plan;
/* ----------------
2010-10-26 10:15:17 +02:00
* these are defined to avoid confusion problems with "left"
* and "right" and "inner" and "outer". The convention is that
* the "left" plan is the "outer" plan and the "right" plan is
* the inner plan, but these make the code more readable.
* ----------------
*/
#define innerPlan(node) (((Plan *)(node))->righttree)
#define outerPlan(node) (((Plan *)(node))->lefttree)
/* ----------------
* Result node -
* If no outer plan, evaluate a variable-free targetlist.
* If outer plan, return tuples from outer plan (after a level of
* projection as shown by targetlist).
*
* If resconstantqual isn't NULL, it represents a one-time qualification
* test (i.e., one that doesn't depend on any variables from the outer plan,
* so needs to be evaluated only once).
* ----------------
*/
typedef struct Result
{
Plan plan;
Node *resconstantqual;
} Result;
/* ----------------
* ModifyTable node -
* Apply rows produced by subplan(s) to result table(s),
* by inserting, updating, or deleting.
*
* Note that rowMarks and epqParam are presumed to be valid for all the
* subplan(s); they can't contain any info that varies across subplans.
* ----------------
*/
typedef struct ModifyTable
{
Plan plan;
2010-02-26 03:01:40 +01:00
CmdType operation; /* INSERT, UPDATE, or DELETE */
bool canSetTag; /* do we set the command tag/es_processed? */
Index nominalRelation; /* Parent RT index for use of EXPLAIN */
List *resultRelations; /* integer list of RT indexes */
2011-04-10 17:42:00 +02:00
int resultRelIndex; /* index of first resultRel in plan's list */
2010-02-26 03:01:40 +01:00
List *plans; /* plan(s) producing source data */
List *withCheckOptionLists; /* per-target-table WCO lists */
2010-02-26 03:01:40 +01:00
List *returningLists; /* per-target-table RETURNING tlists */
List *fdwPrivLists; /* per-target-table FDW private data lists */
2010-02-26 03:01:40 +01:00
List *rowMarks; /* PlanRowMarks (non-locking only) */
int epqParam; /* ID of Param for EvalPlanQual re-eval */
} ModifyTable;
/* ----------------
* Append node -
* Generate the concatenation of the results of sub-plans.
* ----------------
*/
typedef struct Append
{
Plan plan;
List *appendplans;
1997-09-08 22:59:27 +02:00
} Append;
/* ----------------
* MergeAppend node -
* Merge the results of pre-sorted sub-plans to preserve the ordering.
* ----------------
*/
typedef struct MergeAppend
{
Plan plan;
List *mergeplans;
/* remaining fields are just like the sort-key info in struct Sort */
int numCols; /* number of sort-key columns */
AttrNumber *sortColIdx; /* their indexes in the target list */
Oid *sortOperators; /* OIDs of operators to sort them by */
Oid *collations; /* OIDs of collations */
bool *nullsFirst; /* NULLS FIRST/LAST directions */
} MergeAppend;
/* ----------------
* RecursiveUnion node -
* Generate a recursive union of two subplans.
*
* The "outer" subplan is always the non-recursive term, and the "inner"
* subplan is the recursive term.
* ----------------
*/
typedef struct RecursiveUnion
{
Plan plan;
int wtParam; /* ID of Param representing work table */
/* Remaining fields are zero/null in UNION ALL case */
int numCols; /* number of columns to check for
* duplicate-ness */
AttrNumber *dupColIdx; /* their indexes in the target list */
Oid *dupOperators; /* equality operators to compare with */
long numGroups; /* estimated number of groups in input */
} RecursiveUnion;
/* ----------------
* BitmapAnd node -
* Generate the intersection of the results of sub-plans.
*
* The subplans must be of types that yield tuple bitmaps. The targetlist
* and qual fields of the plan are unused and are always NIL.
* ----------------
*/
typedef struct BitmapAnd
{
Plan plan;
List *bitmapplans;
} BitmapAnd;
/* ----------------
* BitmapOr node -
* Generate the union of the results of sub-plans.
*
* The subplans must be of types that yield tuple bitmaps. The targetlist
* and qual fields of the plan are unused and are always NIL.
* ----------------
*/
typedef struct BitmapOr
{
Plan plan;
List *bitmapplans;
} BitmapOr;
/*
* ==========
* Scan nodes
* ==========
*/
typedef struct Scan
{
Plan plan;
Index scanrelid; /* relid is index into the range table */
} Scan;
/* ----------------
* sequential scan node
* ----------------
*/
typedef Scan SeqScan;
/* ----------------
* index scan node
*
* indexqualorig is an implicitly-ANDed list of index qual expressions, each
* in the same form it appeared in the query WHERE condition. Each should
* be of the form (indexkey OP comparisonval) or (comparisonval OP indexkey).
* The indexkey is a Var or expression referencing column(s) of the index's
* base table. The comparisonval might be any expression, but it won't use
* any columns of the base table. The expressions are ordered by index
* column position (but items referencing the same index column can appear
* in any order). indexqualorig is used at runtime only if we have to recheck
* a lossy indexqual.
*
* indexqual has the same form, but the expressions have been commuted if
* necessary to put the indexkeys on the left, and the indexkeys are replaced
* by Var nodes identifying the index columns (their varno is INDEX_VAR and
* their varattno is the index column number).
*
* indexorderbyorig is similarly the original form of any ORDER BY expressions
* that are being implemented by the index, while indexorderby is modified to
* have index column Vars on the left-hand side. Here, multiple expressions
* must appear in exactly the ORDER BY order, and this is not necessarily the
* index column order. Only the expressions are provided, not the auxiliary
* sort-order information from the ORDER BY SortGroupClauses; it's assumed
* that the sort ordering is fully determinable from the top-level operators.
* indexorderbyorig is unused at run time, but is needed for EXPLAIN.
* (Note these fields are used for amcanorderbyop cases, not amcanorder cases.)
*
* indexorderdir specifies the scan ordering, for indexscans on amcanorder
* indexes (for other indexes it should be "don't care").
* ----------------
*/
typedef struct IndexScan
{
Scan scan;
2005-10-15 04:49:52 +02:00
Oid indexid; /* OID of index to scan */
List *indexqual; /* list of index quals (usually OpExprs) */
2005-10-15 04:49:52 +02:00
List *indexqualorig; /* the same in original form */
2011-04-10 17:42:00 +02:00
List *indexorderby; /* list of index ORDER BY exprs */
List *indexorderbyorig; /* the same in original form */
ScanDirection indexorderdir; /* forward or backward or don't care */
} IndexScan;
/* ----------------
* index-only scan node
*
* IndexOnlyScan is very similar to IndexScan, but it specifies an
* index-only scan, in which the data comes from the index not the heap.
* Because of this, *all* Vars in the plan node's targetlist, qual, and
* index expressions reference index columns and have varno = INDEX_VAR.
* Hence we do not need separate indexqualorig and indexorderbyorig lists,
* since their contents would be equivalent to indexqual and indexorderby.
*
* To help EXPLAIN interpret the index Vars for display, we provide
* indextlist, which represents the contents of the index as a targetlist
* with one TLE per index column. Vars appearing in this list reference
* the base table, and this is the only field in the plan node that may
* contain such Vars.
* ----------------
*/
typedef struct IndexOnlyScan
{
Scan scan;
Oid indexid; /* OID of index to scan */
List *indexqual; /* list of index quals (usually OpExprs) */
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
} IndexOnlyScan;
/* ----------------
* bitmap index scan node
*
* BitmapIndexScan delivers a bitmap of potential tuple locations;
* it does not access the heap itself. The bitmap is used by an
* ancestor BitmapHeapScan node, possibly after passing through
* intermediate BitmapAnd and/or BitmapOr nodes to combine it with
* the results of other BitmapIndexScans.
*
* The fields have the same meanings as for IndexScan, except we don't
* store a direction flag because direction is uninteresting.
*
* In a BitmapIndexScan plan node, the targetlist and qual fields are
* not used and are always NIL. The indexqualorig field is unused at
* run time too, but is saved for the benefit of EXPLAIN.
* ----------------
*/
typedef struct BitmapIndexScan
{
Scan scan;
2005-10-15 04:49:52 +02:00
Oid indexid; /* OID of index to scan */
List *indexqual; /* list of index quals (OpExprs) */
List *indexqualorig; /* the same in original form */
} BitmapIndexScan;
/* ----------------
* bitmap sequential scan node
*
* This needs a copy of the qual conditions being used by the input index
* scans because there are various cases where we need to recheck the quals;
* for example, when the bitmap is lossy about the specific rows on a page
* that meet the index condition.
* ----------------
*/
typedef struct BitmapHeapScan
{
Scan scan;
2005-10-15 04:49:52 +02:00
List *bitmapqualorig; /* index quals, in standard expr form */
} BitmapHeapScan;
/* ----------------
* tid scan node
*
* tidquals is an implicitly OR'ed list of qual expressions of the form
* "CTID = pseudoconstant" or "CTID = ANY(pseudoconstant_array)".
* ----------------
*/
typedef struct TidScan
{
Scan scan;
List *tidquals; /* qual(s) involving CTID = something */
} TidScan;
/* ----------------
* subquery scan node
*
* SubqueryScan is for scanning the output of a sub-query in the range table.
* We often need an extra plan node above the sub-query's plan to perform
* expression evaluations (which we can't push into the sub-query without
* risking changing its semantics). Although we are not scanning a physical
* relation, we make this a descendant of Scan anyway for code-sharing
* purposes.
*
* Note: we store the sub-plan in the type-specific subplan field, not in
* the generic lefttree field as you might expect. This is because we do
* not want plan-tree-traversal routines to recurse into the subplan without
* knowing that they are changing Query contexts.
* ----------------
*/
typedef struct SubqueryScan
{
Scan scan;
Plan *subplan;
} SubqueryScan;
/* ----------------
* FunctionScan node
* ----------------
*/
typedef struct FunctionScan
{
2002-09-04 22:31:48 +02:00
Scan scan;
List *functions; /* list of RangeTblFunction nodes */
bool funcordinality; /* WITH ORDINALITY */
} FunctionScan;
/* ----------------
* ValuesScan node
* ----------------
*/
typedef struct ValuesScan
{
Scan scan;
List *values_lists; /* list of expression lists */
} ValuesScan;
/* ----------------
* CteScan node
* ----------------
*/
typedef struct CteScan
{
Scan scan;
int ctePlanId; /* ID of init SubPlan for CTE */
int cteParam; /* ID of Param representing CTE output */
} CteScan;
/* ----------------
* WorkTableScan node
* ----------------
*/
typedef struct WorkTableScan
{
Scan scan;
int wtParam; /* ID of Param representing work table */
} WorkTableScan;
/* ----------------
* ForeignScan node
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
*
* fdw_exprs and fdw_private are both under the control of the foreign-data
* wrapper, but fdw_exprs is presumed to contain expression trees and will
* be post-processed accordingly by the planner; fdw_private won't be.
* Note that everything in both lists must be copiable by copyObject().
* One way to store an arbitrary blob of bytes is to represent it as a bytea
* Const. Usually, though, you'll be better off choosing a representation
* that can be dumped usefully by nodeToString().
* ----------------
*/
typedef struct ForeignScan
{
Scan scan;
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
List *fdw_exprs; /* expressions that FDW may evaluate */
List *fdw_private; /* private data for FDW */
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
bool fsSystemCol; /* true if any "system column" is needed */
} ForeignScan;
/* ----------------
* CustomScan node
*
* The comments for ForeignScan's fdw_exprs and fdw_private fields apply
* equally to custom_exprs and custom_private. Note that since Plan trees
* can be copied, custom scan providers *must* fit all plan data they need
* into those fields; embedding CustomScan in a larger struct will not work.
* ----------------
*/
struct CustomScan;
typedef struct CustomScanMethods
{
const char *CustomName;
/* Create execution state (CustomScanState) from a CustomScan plan node */
Node *(*CreateCustomScanState) (struct CustomScan *cscan);
/* Optional: print custom_xxx fields in some special way */
void (*TextOutCustomScan) (StringInfo str,
const struct CustomScan *node);
} CustomScanMethods;
typedef struct CustomScan
{
Scan scan;
uint32 flags; /* mask of CUSTOMPATH_* flags, see relation.h */
List *custom_exprs; /* expressions that custom code may evaluate */
List *custom_private; /* private data for custom code */
const CustomScanMethods *methods;
} CustomScan;
/*
* ==========
* Join nodes
* ==========
*/
/* ----------------
* Join node
*
* jointype: rule for joining tuples from left and right subtrees
* joinqual: qual conditions that came from JOIN/ON or JOIN/USING
* (plan.qual contains conditions that came from WHERE)
*
* When jointype is INNER, joinqual and plan.qual are semantically
* interchangeable. For OUTER jointypes, the two are *not* interchangeable;
* only joinqual is used to determine whether a match has been found for
* the purpose of deciding whether to generate null-extended tuples.
* (But plan.qual is still applied before actually returning a tuple.)
* For an outer join, only joinquals are allowed to be used as the merge
* or hash condition of a merge or hash join.
* ----------------
*/
typedef struct Join
{
Plan plan;
JoinType jointype;
List *joinqual; /* JOIN quals (in addition to plan.qual) */
} Join;
/* ----------------
* nest loop join node
*
* The nestParams list identifies any executor Params that must be passed
* into execution of the inner subplan carrying values from the current row
* of the outer subplan. Currently we restrict these values to be simple
* Vars, but perhaps someday that'd be worth relaxing. (Note: during plan
* creation, the paramval can actually be a PlaceHolderVar expression; but it
* must be a Var with varno OUTER_VAR by the time it gets to the executor.)
* ----------------
*/
typedef struct NestLoop
{
Join join;
List *nestParams; /* list of NestLoopParam nodes */
} NestLoop;
typedef struct NestLoopParam
{
NodeTag type;
int paramno; /* number of the PARAM_EXEC Param to set */
Var *paramval; /* outer-relation Var to assign to Param */
} NestLoopParam;
/* ----------------
* merge join node
*
* The expected ordering of each mergeable column is described by a btree
* opfamily OID, a collation OID, a direction (BTLessStrategyNumber or
* BTGreaterStrategyNumber) and a nulls-first flag. Note that the two sides
* of each mergeclause may be of different datatypes, but they are ordered the
* same way according to the common opfamily and collation. The operator in
* each mergeclause must be an equality operator of the indicated opfamily.
* ----------------
*/
typedef struct MergeJoin
{
Join join;
2007-11-15 22:14:46 +01:00
List *mergeclauses; /* mergeclauses as expression trees */
/* these are arrays, but have the same length as the mergeclauses list: */
2007-11-15 22:14:46 +01:00
Oid *mergeFamilies; /* per-clause OIDs of btree opfamilies */
Oid *mergeCollations; /* per-clause OIDs of collations */
int *mergeStrategies; /* per-clause ordering (ASC or DESC) */
bool *mergeNullsFirst; /* per-clause nulls ordering */
} MergeJoin;
/* ----------------
* hash join node
* ----------------
*/
typedef struct HashJoin
{
Join join;
List *hashclauses;
} HashJoin;
/* ----------------
* materialization node
* ----------------
*/
typedef struct Material
{
Plan plan;
} Material;
/* ----------------
* sort node
* ----------------
*/
typedef struct Sort
{
Plan plan;
int numCols; /* number of sort-key columns */
AttrNumber *sortColIdx; /* their indexes in the target list */
Oid *sortOperators; /* OIDs of operators to sort them by */
Oid *collations; /* OIDs of collations */
bool *nullsFirst; /* NULLS FIRST/LAST directions */
} Sort;
/* ---------------
* group node -
* Used for queries with GROUP BY (but no aggregates) specified.
* The input must be presorted according to the grouping columns.
* ---------------
*/
typedef struct Group
{
Plan plan;
int numCols; /* number of grouping columns */
AttrNumber *grpColIdx; /* their indexes in the target list */
Oid *grpOperators; /* equality operators to compare with */
} Group;
/* ---------------
* aggregate node
*
* An Agg node implements plain or grouped aggregation. For grouped
* aggregation, we can work with presorted input or unsorted input;
* the latter strategy uses an internal hashtable.
*
* Notice the lack of any direct info about the aggregate functions to be
* computed. They are found by scanning the node's tlist and quals during
* executor startup. (It is possible that there are no aggregate functions;
* this could happen if they get optimized away by constant-folding, or if
* we are using the Agg node to implement hash-based grouping.)
* ---------------
*/
typedef enum AggStrategy
{
AGG_PLAIN, /* simple agg across all input rows */
AGG_SORTED, /* grouped agg, input must be sorted */
AGG_HASHED /* grouped agg, use internal hashtable */
} AggStrategy;
typedef struct Agg
{
Plan plan;
2003-08-04 02:43:34 +02:00
AggStrategy aggstrategy;
int numCols; /* number of grouping columns */
AttrNumber *grpColIdx; /* their indexes in the target list */
Oid *grpOperators; /* equality operators to compare with */
long numGroups; /* estimated number of groups in input */
1997-09-08 22:59:27 +02:00
} Agg;
/* ----------------
* window aggregate node
* ----------------
*/
typedef struct WindowAgg
{
Plan plan;
Index winref; /* ID referenced by window functions */
int partNumCols; /* number of columns in partition clause */
AttrNumber *partColIdx; /* their indexes in the target list */
Oid *partOperators; /* equality operators for partition columns */
int ordNumCols; /* number of columns in ordering clause */
AttrNumber *ordColIdx; /* their indexes in the target list */
Oid *ordOperators; /* equality operators for ordering columns */
int frameOptions; /* frame_clause options, see WindowDef */
Node *startOffset; /* expression for starting bound, if any */
Node *endOffset; /* expression for ending bound, if any */
} WindowAgg;
/* ----------------
* unique node
* ----------------
*/
typedef struct Unique
{
Plan plan;
2005-10-15 04:49:52 +02:00
int numCols; /* number of columns to check for uniqueness */
AttrNumber *uniqColIdx; /* their indexes in the target list */
Oid *uniqOperators; /* equality operators to compare with */
} Unique;
/* ----------------
* hash build node
*
* If the executor is supposed to try to apply skew join optimization, then
* skewTable/skewColumn/skewInherit identify the outer relation's join key
* column, from which the relevant MCV statistics can be fetched. Also, its
* type information is provided to save a lookup.
* ----------------
*/
typedef struct Hash
{
Plan plan;
Oid skewTable; /* outer join key's table OID, or InvalidOid */
AttrNumber skewColumn; /* outer join key's column #, or zero */
bool skewInherit; /* is outer join rel an inheritance tree? */
Oid skewColType; /* datatype of the outer key column */
int32 skewColTypmod; /* typmod of the outer key column */
/* all other info is in the parent HashJoin node */
} Hash;
/* ----------------
* setop node
* ----------------
*/
typedef enum SetOpCmd
{
SETOPCMD_INTERSECT,
SETOPCMD_INTERSECT_ALL,
SETOPCMD_EXCEPT,
SETOPCMD_EXCEPT_ALL
} SetOpCmd;
typedef enum SetOpStrategy
{
SETOP_SORTED, /* input must be sorted */
SETOP_HASHED /* use internal hashtable */
} SetOpStrategy;
typedef struct SetOp
{
Plan plan;
SetOpCmd cmd; /* what to do */
SetOpStrategy strategy; /* how to do it */
int numCols; /* number of columns to check for
* duplicate-ness */
AttrNumber *dupColIdx; /* their indexes in the target list */
Oid *dupOperators; /* equality operators to compare with */
AttrNumber flagColIdx; /* where is the flag column, if any */
int firstFlag; /* flag value for first input relation */
long numGroups; /* estimated number of groups in input */
} SetOp;
/* ----------------
* lock-rows node
*
* rowMarks identifies the rels to be locked by this node; it should be
* a subset of the rowMarks listed in the top-level PlannedStmt.
* epqParam is a Param that all scan nodes below this one must depend on.
* It is used to force re-evaluation of the plan during EvalPlanQual.
* ----------------
*/
typedef struct LockRows
{
Plan plan;
List *rowMarks; /* a list of PlanRowMark's */
int epqParam; /* ID of Param for EvalPlanQual re-eval */
} LockRows;
/* ----------------
* limit node
*
* Note: as of Postgres 8.2, the offset and count expressions are expected
* to yield int8, rather than int4 as before.
* ----------------
*/
typedef struct Limit
{
Plan plan;
Node *limitOffset; /* OFFSET parameter, or NULL if none */
Node *limitCount; /* COUNT parameter, or NULL if none */
} Limit;
/*
* RowMarkType -
* enums for types of row-marking operations
*
* The first four of these values represent different lock strengths that
* we can take on tuples according to SELECT FOR [KEY] UPDATE/SHARE requests.
* We only support these on regular tables. For foreign tables, any locking
* that might be done for these requests must happen during the initial row
* fetch; there is no mechanism for going back to lock a row later (and thus
* no need for EvalPlanQual machinery during updates of foreign tables).
* This means that the semantics will be a bit different than for a local
* table; in particular we are likely to lock more rows than would be locked
* locally, since remote rows will be locked even if they then fail
* locally-checked restriction or join quals. However, the alternative of
* doing a separate remote query to lock each selected row is extremely
* unappealing, so let's do it like this for now.
*
* When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we have to uniquely
* identify all the source rows, not only those from the target relations, so
* that we can perform EvalPlanQual rechecking at need. For plain tables we
* can just fetch the TID, much as for a target relation; this case is
* represented by ROW_MARK_REFERENCE. Otherwise (for example for VALUES or
* FUNCTION scans) we have to copy the whole row value. ROW_MARK_COPY is
* pretty inefficient, since most of the time we'll never need the data; but
* fortunately the case is not performance-critical in practice. Note that
* we use ROW_MARK_COPY for non-target foreign tables, even if the FDW has a
* concept of rowid and so could theoretically support some form of
* ROW_MARK_REFERENCE. Although copying the whole row value is inefficient,
* it's probably still faster than doing a second remote fetch, so it doesn't
* seem worth the extra complexity to permit ROW_MARK_REFERENCE.
*/
typedef enum RowMarkType
{
ROW_MARK_EXCLUSIVE, /* obtain exclusive tuple lock */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ROW_MARK_NOKEYEXCLUSIVE, /* obtain no-key exclusive tuple lock */
ROW_MARK_SHARE, /* obtain shared tuple lock */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ROW_MARK_KEYSHARE, /* obtain keyshare tuple lock */
2015-03-15 23:41:47 +01:00
ROW_MARK_REFERENCE, /* just fetch the TID, don't lock it */
ROW_MARK_COPY /* physically copy the row value */
} RowMarkType;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
#define RowMarkRequiresRowShareLock(marktype) ((marktype) <= ROW_MARK_KEYSHARE)
/*
* PlanRowMark -
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* plan-time representation of FOR [KEY] UPDATE/SHARE clauses
*
* When doing UPDATE, DELETE, or SELECT FOR UPDATE/SHARE, we create a separate
* PlanRowMark node for each non-target relation in the query. Relations that
* are not specified as FOR UPDATE/SHARE are marked ROW_MARK_REFERENCE (if
* regular tables) or ROW_MARK_COPY (if not).
*
* Initially all PlanRowMarks have rti == prti and isParent == false.
* When the planner discovers that a relation is the root of an inheritance
* tree, it sets isParent true, and adds an additional PlanRowMark to the
* list for each child relation (including the target rel itself in its role
* as a child). The child entries have rti == child rel's RT index and
* prti == parent's RT index, and can therefore be recognized as children by
2015-03-15 23:41:47 +01:00
* the fact that prti != rti. The parent's allMarkTypes field gets the OR
* of (1<<markType) across all its children (this definition allows children
* to use different markTypes).
*
* The planner also adds resjunk output columns to the plan that carry
* information sufficient to identify the locked or fetched rows. For
* regular tables (markType != ROW_MARK_COPY), these columns are named
* tableoid%u OID of table
* ctid%u TID of row
* The tableoid column is only present for an inheritance hierarchy.
* When markType == ROW_MARK_COPY, there is instead a single column named
* wholerow%u whole-row value of relation
2015-03-15 23:41:47 +01:00
* (An inheritance hierarchy could have all three resjunk output columns,
* if some children use a different markType than others.)
* In all three cases, %u represents the rowmark ID number (rowmarkId).
* This number is unique within a plan tree, except that child relation
* entries copy their parent's rowmarkId. (Assigning unique numbers
* means we needn't renumber rowmarkIds when flattening subqueries, which
* would require finding and renaming the resjunk columns as well.)
* Note this means that all tables in an inheritance hierarchy share the
* same resjunk column names. However, in an inherited UPDATE/DELETE the
* columns could have different physical column numbers in each subplan.
*/
typedef struct PlanRowMark
{
NodeTag type;
Index rti; /* range table index of markable relation */
Index prti; /* range table index of parent relation */
Index rowmarkId; /* unique identifier for resjunk columns */
2010-02-26 03:01:40 +01:00
RowMarkType markType; /* see enum above */
2015-03-15 23:41:47 +01:00
int allMarkTypes; /* OR of (1<<markType) for all children */
LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */
LockWaitPolicy waitPolicy; /* NOWAIT and SKIP LOCKED options */
bool isParent; /* true if this is a "dummy" parent entry */
} PlanRowMark;
/*
* Plan invalidation info
*
* We track the objects on which a PlannedStmt depends in two ways:
* relations are recorded as a simple list of OIDs, and everything else
* is represented as a list of PlanInvalItems. A PlanInvalItem is designed
* to be used with the syscache invalidation mechanism, so it identifies a
* system catalog entry by cache ID and hash value.
*/
typedef struct PlanInvalItem
{
NodeTag type;
int cacheId; /* a syscache ID, see utils/syscache.h */
uint32 hashValue; /* hash value of object's cache lookup key */
} PlanInvalItem;
#endif /* PLANNODES_H */