postgresql/src/backend/nodes/outfuncs.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

4816 lines
112 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* outfuncs.c
* Output functions for Postgres tree nodes.
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/nodes/outfuncs.c
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
* have an output function defined here (as well as an input function
* in readfuncs.c). In addition, plan nodes should have input and
* output functions so that they can be sent to parallel workers.
*
* For use in debugging, we also provide output functions for nodes
* that appear in raw parsetrees and planner Paths. These node types
* need not have input functions. Output support for raw parsetrees
* is somewhat incomplete, too; in particular, utility statements are
* almost entirely unsupported. We try to support everything that can
* appear in a raw SELECT, though.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "lib/stringinfo.h"
#include "miscadmin.h"
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#include "nodes/bitmapset.h"
#include "nodes/nodes.h"
#include "nodes/pg_list.h"
1999-07-16 07:00:38 +02:00
#include "utils/datum.h"
static void outChar(StringInfo str, char c);
/*
* Macros to simplify output of different kinds of fields. Use these
* wherever possible to reduce the chance for silly typos. Note that these
* hard-wire conventions about the names of the local variables in an Out
* routine.
*/
/* Write the label for the node type */
#define WRITE_NODE_TYPE(nodelabel) \
appendStringInfoString(str, nodelabel)
/* Write an integer field (anything written as ":fldname %d") */
#define WRITE_INT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write an unsigned integer field (anything written as ":fldname %u") */
#define WRITE_UINT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* Write an unsigned integer field (anything written with UINT64_FORMAT) */
#define WRITE_UINT64_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, \
node->fldname)
/* Write an OID field (don't hard-wire assumption that OID is same as uint) */
#define WRITE_OID_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* Write a long-integer field */
#define WRITE_LONG_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %ld", node->fldname)
/* Write a char field (ie, one ascii character) */
#define WRITE_CHAR_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
outChar(str, node->fldname))
/* Write an enumerated-type field as an integer code */
#define WRITE_ENUM_FIELD(fldname, enumtype) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", \
(int) node->fldname)
/* Write a float field --- caller must give format to define precision */
#define WRITE_FLOAT_FIELD(fldname,format) \
appendStringInfo(str, " :" CppAsString(fldname) " " format, node->fldname)
/* Write a boolean field */
#define WRITE_BOOL_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %s", \
booltostr(node->fldname))
/* Write a character-string (possibly NULL) field */
#define WRITE_STRING_FIELD(fldname) \
(appendStringInfoString(str, " :" CppAsString(fldname) " "), \
outToken(str, node->fldname))
/* Write a parse location field (actually same as INT case) */
#define WRITE_LOCATION_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write a Node field */
#define WRITE_NODE_FIELD(fldname) \
(appendStringInfoString(str, " :" CppAsString(fldname) " "), \
outNode(str, node->fldname))
/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
(appendStringInfoString(str, " :" CppAsString(fldname) " "), \
outBitmapset(str, node->fldname))
#define WRITE_ATTRNUMBER_ARRAY(fldname, len) \
do { \
appendStringInfoString(str, " :" CppAsString(fldname) " "); \
for (int i = 0; i < len; i++) \
appendStringInfo(str, " %d", node->fldname[i]); \
} while(0)
#define WRITE_OID_ARRAY(fldname, len) \
do { \
appendStringInfoString(str, " :" CppAsString(fldname) " "); \
for (int i = 0; i < len; i++) \
appendStringInfo(str, " %u", node->fldname[i]); \
} while(0)
/*
* This macro supports the case that the field is NULL. For the other array
* macros, that is currently not needed.
*/
#define WRITE_INDEX_ARRAY(fldname, len) \
do { \
appendStringInfoString(str, " :" CppAsString(fldname) " "); \
if (node->fldname) \
for (int i = 0; i < len; i++) \
appendStringInfo(str, " %u", node->fldname[i]); \
else \
appendStringInfoString(str, "<>"); \
} while(0)
#define WRITE_INT_ARRAY(fldname, len) \
do { \
appendStringInfoString(str, " :" CppAsString(fldname) " "); \
for (int i = 0; i < len; i++) \
appendStringInfo(str, " %d", node->fldname[i]); \
} while(0)
#define WRITE_BOOL_ARRAY(fldname, len) \
do { \
appendStringInfoString(str, " :" CppAsString(fldname) " "); \
for (int i = 0; i < len; i++) \
appendStringInfo(str, " %s", booltostr(node->fldname[i])); \
} while(0)
#define booltostr(x) ((x) ? "true" : "false")
/*
* outToken
* Convert an ordinary string (eg, an identifier) into a form that
* will be decoded back to a plain token by read.c's functions.
*
* If a null or empty string is given, it is encoded as "<>".
*/
void
outToken(StringInfo str, const char *s)
{
if (s == NULL || *s == '\0')
{
appendStringInfoString(str, "<>");
return;
}
/*
* Look for characters or patterns that are treated specially by read.c
* (either in pg_strtok() or in nodeRead()), and therefore need a
* protective backslash.
*/
/* These characters only need to be quoted at the start of the string */
if (*s == '<' ||
*s == '"' ||
isdigit((unsigned char) *s) ||
((*s == '+' || *s == '-') &&
(isdigit((unsigned char) s[1]) || s[1] == '.')))
appendStringInfoChar(str, '\\');
while (*s)
{
/* These chars must be backslashed anywhere in the string */
if (*s == ' ' || *s == '\n' || *s == '\t' ||
*s == '(' || *s == ')' || *s == '{' || *s == '}' ||
*s == '\\')
appendStringInfoChar(str, '\\');
appendStringInfoChar(str, *s++);
}
}
/*
* Convert one char. Goes through outToken() so that special characters are
* escaped.
*/
static void
outChar(StringInfo str, char c)
{
char in[2];
in[0] = c;
in[1] = '\0';
outToken(str, in);
}
static void
_outList(StringInfo str, const List *node)
{
const ListCell *lc;
appendStringInfoChar(str, '(');
if (IsA(node, IntList))
appendStringInfoChar(str, 'i');
else if (IsA(node, OidList))
appendStringInfoChar(str, 'o');
else if (IsA(node, XidList))
appendStringInfoChar(str, 'x');
foreach(lc, node)
{
/*
* For the sake of backward compatibility, we emit a slightly
* different whitespace format for lists of nodes vs. other types of
* lists. XXX: is this necessary?
*/
if (IsA(node, List))
{
outNode(str, lfirst(lc));
Represent Lists as expansible arrays, not chains of cons-cells. Originally, Postgres Lists were a more or less exact reimplementation of Lisp lists, which consist of chains of separately-allocated cons cells, each having a value and a next-cell link. We'd hacked that once before (commit d0b4399d8) to add a separate List header, but the data was still in cons cells. That makes some operations -- notably list_nth() -- O(N), and it's bulky because of the next-cell pointers and per-cell palloc overhead, and it's very cache-unfriendly if the cons cells end up scattered around rather than being adjacent. In this rewrite, we still have List headers, but the data is in a resizable array of values, with no next-cell links. Now we need at most two palloc's per List, and often only one, since we can allocate some values in the same palloc call as the List header. (Of course, extending an existing List may require repalloc's to enlarge the array. But this involves just O(log N) allocations not O(N).) Of course this is not without downsides. The key difficulty is that addition or deletion of a list entry may now cause other entries to move, which it did not before. For example, that breaks foreach() and sister macros, which historically used a pointer to the current cons-cell as loop state. We can repair those macros transparently by making their actual loop state be an integer list index; the exposed "ListCell *" pointer is no longer state carried across loop iterations, but is just a derived value. (In practice, modern compilers can optimize things back to having just one loop state value, at least for simple cases with inline loop bodies.) In principle, this is a semantics change for cases where the loop body inserts or deletes list entries ahead of the current loop index; but I found no such cases in the Postgres code. The change is not at all transparent for code that doesn't use foreach() but chases lists "by hand" using lnext(). The largest share of such code in the backend is in loops that were maintaining "prev" and "next" variables in addition to the current-cell pointer, in order to delete list cells efficiently using list_delete_cell(). However, we no longer need a previous-cell pointer to delete a list cell efficiently. Keeping a next-cell pointer doesn't work, as explained above, but we can improve matters by changing such code to use a regular foreach() loop and then using the new macro foreach_delete_current() to delete the current cell. (This macro knows how to update the associated foreach loop's state so that no cells will be missed in the traversal.) There remains a nontrivial risk of code assuming that a ListCell * pointer will remain good over an operation that could now move the list contents. To help catch such errors, list.c can be compiled with a new define symbol DEBUG_LIST_MEMORY_USAGE that forcibly moves list contents whenever that could possibly happen. This makes list operations significantly more expensive so it's not normally turned on (though it is on by default if USE_VALGRIND is on). There are two notable API differences from the previous code: * lnext() now requires the List's header pointer in addition to the current cell's address. * list_delete_cell() no longer requires a previous-cell argument. These changes are somewhat unfortunate, but on the other hand code using either function needs inspection to see if it is assuming anything it shouldn't, so it's not all bad. Programmers should be aware of these significant performance changes: * list_nth() and related functions are now O(1); so there's no major access-speed difference between a list and an array. * Inserting or deleting a list element now takes time proportional to the distance to the end of the list, due to moving the array elements. (However, it typically *doesn't* require palloc or pfree, so except in long lists it's probably still faster than before.) Notably, lcons() used to be about the same cost as lappend(), but that's no longer true if the list is long. Code that uses lcons() and list_delete_first() to maintain a stack might usefully be rewritten to push and pop at the end of the list rather than the beginning. * There are now list_insert_nth...() and list_delete_nth...() functions that add or remove a list cell identified by index. These have the data-movement penalty explained above, but there's no search penalty. * list_concat() and variants now copy the second list's data into storage belonging to the first list, so there is no longer any sharing of cells between the input lists. The second argument is now declared "const List *" to reflect that it isn't changed. This patch just does the minimum needed to get the new implementation in place and fix bugs exposed by the regression tests. As suggested by the foregoing, there's a fair amount of followup work remaining to do. Also, the ENABLE_LIST_COMPAT macros are finally removed in this commit. Code using those should have been gone a dozen years ago. Patch by me; thanks to David Rowley, Jesper Pedersen, and others for review. Discussion: https://postgr.es/m/11587.1550975080@sss.pgh.pa.us
2019-07-15 19:41:58 +02:00
if (lnext(node, lc))
appendStringInfoChar(str, ' ');
}
else if (IsA(node, IntList))
appendStringInfo(str, " %d", lfirst_int(lc));
else if (IsA(node, OidList))
appendStringInfo(str, " %u", lfirst_oid(lc));
else if (IsA(node, XidList))
appendStringInfo(str, " %u", lfirst_xid(lc));
else
elog(ERROR, "unrecognized list node type: %d",
(int) node->type);
}
appendStringInfoChar(str, ')');
}
/*
* outBitmapset -
* converts a bitmap set of integers
*
* Note: the output format is "(b int int ...)", similar to an integer List.
*/
void
outBitmapset(StringInfo str, const Bitmapset *bms)
{
int x;
appendStringInfoChar(str, '(');
appendStringInfoChar(str, 'b');
x = -1;
while ((x = bms_next_member(bms, x)) >= 0)
appendStringInfo(str, " %d", x);
appendStringInfoChar(str, ')');
}
/*
* Print the value of a Datum given its type.
*/
void
outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
{
Size length,
i;
char *s;
length = datumGetSize(value, typbyval, typlen);
1999-01-21 17:08:55 +01:00
if (typbyval)
{
s = (char *) (&value);
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < (Size) sizeof(Datum); i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
else
{
s = (char *) DatumGetPointer(value);
if (!PointerIsValid(s))
appendStringInfoString(str, "0 [ ]");
else
{
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < length; i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
}
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#include "outfuncs.funcs.c"
#ifdef OBSOLETE
/*
* Stuff from plannodes.h
*/
static void
_outPlannedStmt(StringInfo str, const PlannedStmt *node)
{
WRITE_NODE_TYPE("PLANNEDSTMT");
WRITE_ENUM_FIELD(commandType, CmdType);
WRITE_UINT64_FIELD(queryId);
WRITE_BOOL_FIELD(hasReturning);
WRITE_BOOL_FIELD(hasModifyingCTE);
WRITE_BOOL_FIELD(canSetTag);
WRITE_BOOL_FIELD(transientPlan);
Avoid invalidating all foreign-join cached plans when user mappings change. We must not push down a foreign join when the foreign tables involved should be accessed under different user mappings. Previously we tried to enforce that rule literally during planning, but that meant that the resulting plans were dependent on the current contents of the pg_user_mapping catalog, and we had to blow away all cached plans containing any remote join when anything at all changed in pg_user_mapping. This could have been improved somewhat, but the fact that a syscache inval callback has very limited info about what changed made it hard to do better within that design. Instead, let's change the planner to not consider user mappings per se, but to allow a foreign join if both RTEs have the same checkAsUser value. If they do, then they necessarily will use the same user mapping at runtime, and we don't need to know specifically which one that is. Post-plan-time changes in pg_user_mapping no longer require any plan invalidation. This rule does give up some optimization ability, to wit where two foreign table references come from views with different owners or one's from a view and one's directly in the query, but nonetheless the same user mapping would have applied. We'll sacrifice the first case, but to not regress more than we have to in the second case, allow a foreign join involving both zero and nonzero checkAsUser values if the nonzero one is the same as the prevailing effective userID. In that case, mark the plan as only runnable by that userID. The plancache code already had a notion of plans being userID-specific, in order to support RLS. It was a little confused though, in particular lacking clarity of thought as to whether it was the rewritten query or just the finished plan that's dependent on the userID. Rearrange that code so that it's clearer what depends on which, and so that the same logic applies to both RLS-injected role dependency and foreign-join-injected role dependency. Note that this patch doesn't remove the other issue mentioned in the original complaint, which is that while we'll reliably stop using a foreign join if it's disallowed in a new context, we might fail to start using a foreign join if it's now allowed, but we previously created a generic cached plan that didn't use one. It was agreed that the chance of winning that way was not high enough to justify the much larger number of plan invalidations that would have to occur if we tried to cause it to happen. In passing, clean up randomly-varying spelling of EXPLAIN commands in postgres_fdw.sql, and fix a COSTS ON example that had been allowed to leak into the committed tests. This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the previous attempt at ensuring we wouldn't push down foreign joins that span permissions contexts. Etsuro Fujita and Tom Lane Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
WRITE_BOOL_FIELD(dependsOnRole);
WRITE_BOOL_FIELD(parallelModeNeeded);
WRITE_INT_FIELD(jitFlags);
WRITE_NODE_FIELD(planTree);
WRITE_NODE_FIELD(rtable);
WRITE_NODE_FIELD(resultRelations);
Further adjust EXPLAIN's choices of table alias names. This patch causes EXPLAIN to always assign a separate table alias to the parent RTE of an append relation (inheritance set); before, such RTEs were ignored if not actually scanned by the plan. Since the child RTEs now always have that same alias to start with (cf. commit 55a1954da), the net effect is that the parent RTE usually gets the alias used or implied by the query text, and the children all get that alias with "_N" appended. (The exception to "usually" is if there are duplicate aliases in different subtrees of the original query; then some of those original RTEs will also have "_N" appended.) This results in more uniform output for partitioned-table plans than we had before: the partitioned table itself gets the original alias, and all child tables have aliases with "_N", rather than the previous behavior where one of the children would get an alias without "_N". The reason for giving the parent RTE an alias, even if it isn't scanned by the plan, is that we now use the parent's alias to qualify Vars that refer to an appendrel output column and appear above the Append or MergeAppend that computes the appendrel. But below the append, Vars refer to some one of the child relations, and are displayed that way. This seems clearer than the old behavior where a Var that could carry values from any child relation was displayed as if it referred to only one of them. While at it, change ruleutils.c so that the code paths used by EXPLAIN deal in Plan trees not PlanState trees. This effectively reverts a decision made in commit 1cc29fe7c, which seemed like a good idea at the time to make ruleutils.c consistent with explain.c. However, it's problematic because we'd really like to allow executor startup pruning to remove all the children of an append node when possible, leaving no child PlanState to resolve Vars against. (That's not done here, but will be in the next patch.) This requires different handling of subplans and initplans than before, but is otherwise a pretty straightforward change. Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
WRITE_NODE_FIELD(appendRelations);
WRITE_NODE_FIELD(subplans);
WRITE_BITMAPSET_FIELD(rewindPlanIDs);
WRITE_NODE_FIELD(rowMarks);
WRITE_NODE_FIELD(relationOids);
WRITE_NODE_FIELD(invalItems);
WRITE_NODE_FIELD(paramExecTypes);
Change representation of statement lists, and add statement location info. This patch makes several changes that improve the consistency of representation of lists of statements. It's always been the case that the output of parse analysis is a list of Query nodes, whatever the types of the individual statements in the list. This patch brings similar consistency to the outputs of raw parsing and planning steps: * The output of raw parsing is now always a list of RawStmt nodes; the statement-type-dependent nodes are one level down from that. * The output of pg_plan_queries() is now always a list of PlannedStmt nodes, even for utility statements. In the case of a utility statement, "planning" just consists of wrapping a CMD_UTILITY PlannedStmt around the utility node. This list representation is now used in Portal and CachedPlan plan lists, replacing the former convention of intermixing PlannedStmts with bare utility-statement nodes. Now, every list of statements has a consistent head-node type depending on how far along it is in processing. This allows changing many places that formerly used generic "Node *" pointers to use a more specific pointer type, thus reducing the number of IsA() tests and casts needed, as well as improving code clarity. Also, the post-parse-analysis representation of DECLARE CURSOR is changed so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained SELECT remains a child of the DeclareCursorStmt rather than getting flipped around to be the other way. It's now true for both Query and PlannedStmt that utilityStmt is non-null if and only if commandType is CMD_UTILITY. That allows simplifying a lot of places that were testing both fields. (I think some of those were just defensive programming, but in many places, it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.) Because PlannedStmt carries a canSetTag field, we're also able to get rid of some ad-hoc rules about how to reconstruct canSetTag for a bare utility statement; specifically, the assumption that a utility is canSetTag if and only if it's the only one in its list. While I see no near-term need for relaxing that restriction, it's nice to get rid of the ad-hocery. The API of ProcessUtility() is changed so that what it's passed is the wrapper PlannedStmt not just the bare utility statement. This will affect all users of ProcessUtility_hook, but the changes are pretty trivial; see the affected contrib modules for examples of the minimum change needed. (Most compilers should give pointer-type-mismatch warnings for uncorrected code.) There's also a change in the API of ExplainOneQuery_hook, to pass through cursorOptions instead of expecting hook functions to know what to pick. This is needed because of the DECLARE CURSOR changes, but really should have been done in 9.6; it's unlikely that any extant hook functions know about using CURSOR_OPT_PARALLEL_OK. Finally, teach gram.y to save statement boundary locations in RawStmt nodes, and pass those through to Query and PlannedStmt nodes. This allows more intelligent handling of cases where a source query string contains multiple statements. This patch doesn't actually do anything with the information, but a follow-on patch will. (Passing this information through cleanly is the true motivation for these changes; while I think this is all good cleanup, it's unlikely we'd have bothered without this end goal.) catversion bump because addition of location fields to struct Query affects stored rules. This patch is by me, but it owes a good deal to Fabien Coelho who did a lot of preliminary work on the problem, and also reviewed the patch. Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
WRITE_NODE_FIELD(utilityStmt);
WRITE_LOCATION_FIELD(stmt_location);
WRITE_INT_FIELD(stmt_len);
}
/*
* print the basic stuff of all nodes that inherit from Plan
*/
static void
_outPlanInfo(StringInfo str, const Plan *node)
{
WRITE_FLOAT_FIELD(startup_cost, "%.2f");
WRITE_FLOAT_FIELD(total_cost, "%.2f");
WRITE_FLOAT_FIELD(plan_rows, "%.0f");
WRITE_INT_FIELD(plan_width);
WRITE_BOOL_FIELD(parallel_aware);
WRITE_BOOL_FIELD(parallel_safe);
Add support for asynchronous execution. This implements asynchronous execution, which runs multiple parts of a non-parallel-aware Append concurrently rather than serially to improve performance when possible. Currently, the only node type that can be run concurrently is a ForeignScan that is an immediate child of such an Append. In the case where such ForeignScans access data on different remote servers, this would run those ForeignScans concurrently, and overlap the remote operations to be performed simultaneously, so it'll improve the performance especially when the operations involve time-consuming ones such as remote join and remote aggregation. We may extend this to other node types such as joins or aggregates over ForeignScans in the future. This also adds the support for postgres_fdw, which is enabled by the table-level/server-level option "async_capable". The default is false. Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit is mostly based on the patch proposed by Robert Haas, but also uses stuff from the patch proposed by Kyotaro Horiguchi and from the patch proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and others. Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
WRITE_BOOL_FIELD(async_capable);
WRITE_INT_FIELD(plan_node_id);
WRITE_NODE_FIELD(targetlist);
WRITE_NODE_FIELD(qual);
WRITE_NODE_FIELD(lefttree);
WRITE_NODE_FIELD(righttree);
WRITE_NODE_FIELD(initPlan);
WRITE_BITMAPSET_FIELD(extParam);
WRITE_BITMAPSET_FIELD(allParam);
}
/*
* print the basic stuff of all nodes that inherit from Scan
*/
static void
_outScanInfo(StringInfo str, const Scan *node)
{
_outPlanInfo(str, (const Plan *) node);
WRITE_UINT_FIELD(scanrelid);
}
/*
* print the basic stuff of all nodes that inherit from Join
*/
static void
_outJoinPlanInfo(StringInfo str, const Join *node)
{
_outPlanInfo(str, (const Plan *) node);
WRITE_ENUM_FIELD(jointype, JoinType);
WRITE_BOOL_FIELD(inner_unique);
WRITE_NODE_FIELD(joinqual);
}
static void
_outResult(StringInfo str, const Result *node)
{
WRITE_NODE_TYPE("RESULT");
_outPlanInfo(str, (const Plan *) node);
WRITE_NODE_FIELD(resconstantqual);
}
Move targetlist SRF handling from expression evaluation to new executor node. Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT generate_series(1,5)) so far was done in the expression evaluation (i.e. ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code. This meant that most executor nodes performing projection, and most expression evaluation functions, had to deal with the possibility that an evaluated expression could return a set of return values. That's bad because it leads to repeated code in a lot of places. It also, and that's my (Andres's) motivation, made it a lot harder to implement a more efficient way of doing expression evaluation. To fix this, introduce a new executor node (ProjectSet) that can evaluate targetlists containing one or more SRFs. To avoid the complexity of the old way of handling nested expressions returning sets (e.g. having to pass up ExprDoneCond, and dealing with arguments to functions returning sets etc.), those SRFs can only be at the top level of the node's targetlist. The planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is only necessary in ProjectSet nodes and that SRFs are only present at the top level of the node's targetlist. If there are nested SRFs the planner creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get input from an underlying node. We also discussed and prototyped evaluating targetlist SRFs using ROWS FROM(), but that turned out to be more complicated than we'd hoped. While moving SRF evaluation to ProjectSet would allow to retain the old "least common multiple" behavior when multiple SRFs are present in one targetlist (i.e. continue returning rows until all SRFs are at the end of their input at the same time), we decided to instead only return rows till all SRFs are exhausted, returning NULL for already exhausted ones. We deemed the previous behavior to be too confusing, unexpected and actually not particularly useful. As a side effect, the previously prohibited case of multiple set returning arguments to a function, is now allowed. Not because it's particularly desirable, but because it ends up working and there seems to be no argument for adding code to prohibit it. Currently the behavior for COALESCE and CASE containing SRFs has changed, returning multiple rows from the expression, even when the SRF containing "arm" of the expression is not evaluated. That's because the SRFs are evaluated in a separate ProjectSet node. As that's quite confusing, we're likely to instead prohibit SRFs in those places. But that's still being discussed, and the code would reside in places not touched here, so that's a task for later. There's a lot of, now superfluous, code dealing with set return expressions around. But as the changes to get rid of those are verbose largely boring, it seems better for readability to keep the cleanup as a separate commit. Author: Tom Lane and Andres Freund Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
static void
_outProjectSet(StringInfo str, const ProjectSet *node)
{
WRITE_NODE_TYPE("PROJECTSET");
_outPlanInfo(str, (const Plan *) node);
}
static void
_outModifyTable(StringInfo str, const ModifyTable *node)
{
WRITE_NODE_TYPE("MODIFYTABLE");
_outPlanInfo(str, (const Plan *) node);
WRITE_ENUM_FIELD(operation, CmdType);
WRITE_BOOL_FIELD(canSetTag);
WRITE_UINT_FIELD(nominalRelation);
WRITE_UINT_FIELD(rootRelation);
WRITE_BOOL_FIELD(partColsUpdated);
WRITE_NODE_FIELD(resultRelations);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_NODE_FIELD(updateColnosLists);
WRITE_NODE_FIELD(withCheckOptionLists);
WRITE_NODE_FIELD(returningLists);
WRITE_NODE_FIELD(fdwPrivLists);
WRITE_BITMAPSET_FIELD(fdwDirectModifyPlans);
WRITE_NODE_FIELD(rowMarks);
WRITE_INT_FIELD(epqParam);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
WRITE_ENUM_FIELD(onConflictAction, OnConflictAction);
WRITE_NODE_FIELD(arbiterIndexes);
WRITE_NODE_FIELD(onConflictSet);
Fix mishandling of resjunk columns in ON CONFLICT ... UPDATE tlists. It's unusual to have any resjunk columns in an ON CONFLICT ... UPDATE list, but it can happen when MULTIEXPR_SUBLINK SubPlans are present. If it happens, the ON CONFLICT UPDATE code path would end up storing tuples that include the values of the extra resjunk columns. That's fairly harmless in the short run, but if new columns are added to the table then the values would become accessible, possibly leading to malfunctions if they don't match the datatypes of the new columns. This had escaped notice through a confluence of missing sanity checks, including * There's no cross-check that a tuple presented to heap_insert or heap_update matches the table rowtype. While it's difficult to check that fully at reasonable cost, we can easily add assertions that there aren't too many columns. * The output-column-assignment cases in execExprInterp.c lacked any sanity checks on the output column numbers, which seems like an oversight considering there are plenty of assertion checks on input column numbers. Add assertions there too. * We failed to apply nodeModifyTable's ExecCheckPlanOutput() to the ON CONFLICT UPDATE tlist. That wouldn't have caught this specific error, since that function is chartered to ignore resjunk columns; but it sure seems like a bad omission now that we've seen this bug. In HEAD, the right way to fix this is to make the processing of ON CONFLICT UPDATE tlists work the same as regular UPDATE tlists now do, that is don't add "SET x = x" entries, and use ExecBuildUpdateProjection to evaluate the tlist and combine it with old values of the not-set columns. This adds a little complication to ExecBuildUpdateProjection, but allows removal of a comparable amount of now-dead code from the planner. In the back branches, the most expedient solution seems to be to (a) use an output slot for the ON CONFLICT UPDATE projection that actually matches the target table, and then (b) invent a variant of ExecBuildProjectionInfo that can be told to not store values resulting from resjunk columns, so it doesn't try to store into nonexistent columns of the output slot. (We can't simply ignore the resjunk columns altogether; they have to be evaluated for MULTIEXPR_SUBLINK to work.) This works back to v10. In 9.6, projections work much differently and we can't cheaply give them such an option. The 9.6 version of this patch works by inserting a JunkFilter when it's necessary to get rid of resjunk columns. In addition, v11 and up have the reverse problem when trying to perform ON CONFLICT UPDATE on a partitioned table. Through a further oversight, adjust_partition_tlist() discarded resjunk columns when re-ordering the ON CONFLICT UPDATE tlist to match a partition. This accidentally prevented the storing-bogus-tuples problem, but at the cost that MULTIEXPR_SUBLINK cases didn't work, typically crashing if more than one row has to be updated. Fix by preserving resjunk columns in that routine. (I failed to resist the temptation to add more assertions there too, and to do some minor code beautification.) Per report from Andres Freund. Back-patch to all supported branches. Security: CVE-2021-32028
2021-05-10 17:02:29 +02:00
WRITE_NODE_FIELD(onConflictCols);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
WRITE_NODE_FIELD(onConflictWhere);
WRITE_UINT_FIELD(exclRelRTI);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
WRITE_NODE_FIELD(exclRelTlist);
Add support for MERGE SQL command MERGE performs actions that modify rows in the target table using a source table or query. MERGE provides a single SQL statement that can conditionally INSERT/UPDATE/DELETE rows -- a task that would otherwise require multiple PL statements. For example, MERGE INTO target AS t USING source AS s ON t.tid = s.sid WHEN MATCHED AND t.balance > s.delta THEN UPDATE SET balance = t.balance - s.delta WHEN MATCHED THEN DELETE WHEN NOT MATCHED AND s.delta > 0 THEN INSERT VALUES (s.sid, s.delta) WHEN NOT MATCHED THEN DO NOTHING; MERGE works with regular tables, partitioned tables and inheritance hierarchies, including column and row security enforcement, as well as support for row and statement triggers and transition tables therein. MERGE is optimized for OLTP and is parameterizable, though also useful for large scale ETL/ELT. MERGE is not intended to be used in preference to existing single SQL commands for INSERT, UPDATE or DELETE since there is some overhead. MERGE can be used from PL/pgSQL. MERGE does not support targetting updatable views or foreign tables, and RETURNING clauses are not allowed either. These limitations are likely fixable with sufficient effort. Rewrite rules are also not supported, but it's not clear that we'd want to support them. Author: Pavan Deolasee <pavan.deolasee@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Author: Amit Langote <amitlangote09@gmail.com> Author: Simon Riggs <simon.riggs@enterprisedb.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier versions) Reviewed-by: Peter Geoghegan <pg@bowt.ie> (earlier versions) Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Japin Li <japinli@hotmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Zhihong Yu <zyu@yugabyte.com> Discussion: https://postgr.es/m/CANP8+jKitBSrB7oTgT9CY2i1ObfOt36z0XMraQc+Xrz8QB0nXA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkJdBuxj9PO=2QaO9-3h3xGbQPZ34kJH=HukRekwM-GZg@mail.gmail.com Discussion: https://postgr.es/m/20201231134736.GA25392@alvherre.pgsql
2022-03-28 16:45:58 +02:00
WRITE_NODE_FIELD(mergeActionLists);
}
static void
_outAppend(StringInfo str, const Append *node)
{
WRITE_NODE_TYPE("APPEND");
_outPlanInfo(str, (const Plan *) node);
Further adjust EXPLAIN's choices of table alias names. This patch causes EXPLAIN to always assign a separate table alias to the parent RTE of an append relation (inheritance set); before, such RTEs were ignored if not actually scanned by the plan. Since the child RTEs now always have that same alias to start with (cf. commit 55a1954da), the net effect is that the parent RTE usually gets the alias used or implied by the query text, and the children all get that alias with "_N" appended. (The exception to "usually" is if there are duplicate aliases in different subtrees of the original query; then some of those original RTEs will also have "_N" appended.) This results in more uniform output for partitioned-table plans than we had before: the partitioned table itself gets the original alias, and all child tables have aliases with "_N", rather than the previous behavior where one of the children would get an alias without "_N". The reason for giving the parent RTE an alias, even if it isn't scanned by the plan, is that we now use the parent's alias to qualify Vars that refer to an appendrel output column and appear above the Append or MergeAppend that computes the appendrel. But below the append, Vars refer to some one of the child relations, and are displayed that way. This seems clearer than the old behavior where a Var that could carry values from any child relation was displayed as if it referred to only one of them. While at it, change ruleutils.c so that the code paths used by EXPLAIN deal in Plan trees not PlanState trees. This effectively reverts a decision made in commit 1cc29fe7c, which seemed like a good idea at the time to make ruleutils.c consistent with explain.c. However, it's problematic because we'd really like to allow executor startup pruning to remove all the children of an append node when possible, leaving no child PlanState to resolve Vars against. (That's not done here, but will be in the next patch.) This requires different handling of subplans and initplans than before, but is otherwise a pretty straightforward change. Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
WRITE_BITMAPSET_FIELD(apprelids);
WRITE_NODE_FIELD(appendplans);
Add support for asynchronous execution. This implements asynchronous execution, which runs multiple parts of a non-parallel-aware Append concurrently rather than serially to improve performance when possible. Currently, the only node type that can be run concurrently is a ForeignScan that is an immediate child of such an Append. In the case where such ForeignScans access data on different remote servers, this would run those ForeignScans concurrently, and overlap the remote operations to be performed simultaneously, so it'll improve the performance especially when the operations involve time-consuming ones such as remote join and remote aggregation. We may extend this to other node types such as joins or aggregates over ForeignScans in the future. This also adds the support for postgres_fdw, which is enabled by the table-level/server-level option "async_capable". The default is false. Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit is mostly based on the patch proposed by Robert Haas, but also uses stuff from the patch proposed by Kyotaro Horiguchi and from the patch proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and others. Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
WRITE_INT_FIELD(nasyncplans);
WRITE_INT_FIELD(first_partial_plan);
WRITE_NODE_FIELD(part_prune_info);
}
static void
_outMergeAppend(StringInfo str, const MergeAppend *node)
{
WRITE_NODE_TYPE("MERGEAPPEND");
_outPlanInfo(str, (const Plan *) node);
Further adjust EXPLAIN's choices of table alias names. This patch causes EXPLAIN to always assign a separate table alias to the parent RTE of an append relation (inheritance set); before, such RTEs were ignored if not actually scanned by the plan. Since the child RTEs now always have that same alias to start with (cf. commit 55a1954da), the net effect is that the parent RTE usually gets the alias used or implied by the query text, and the children all get that alias with "_N" appended. (The exception to "usually" is if there are duplicate aliases in different subtrees of the original query; then some of those original RTEs will also have "_N" appended.) This results in more uniform output for partitioned-table plans than we had before: the partitioned table itself gets the original alias, and all child tables have aliases with "_N", rather than the previous behavior where one of the children would get an alias without "_N". The reason for giving the parent RTE an alias, even if it isn't scanned by the plan, is that we now use the parent's alias to qualify Vars that refer to an appendrel output column and appear above the Append or MergeAppend that computes the appendrel. But below the append, Vars refer to some one of the child relations, and are displayed that way. This seems clearer than the old behavior where a Var that could carry values from any child relation was displayed as if it referred to only one of them. While at it, change ruleutils.c so that the code paths used by EXPLAIN deal in Plan trees not PlanState trees. This effectively reverts a decision made in commit 1cc29fe7c, which seemed like a good idea at the time to make ruleutils.c consistent with explain.c. However, it's problematic because we'd really like to allow executor startup pruning to remove all the children of an append node when possible, leaving no child PlanState to resolve Vars against. (That's not done here, but will be in the next patch.) This requires different handling of subplans and initplans than before, but is otherwise a pretty straightforward change. Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
WRITE_BITMAPSET_FIELD(apprelids);
WRITE_NODE_FIELD(mergeplans);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
WRITE_OID_ARRAY(sortOperators, node->numCols);
WRITE_OID_ARRAY(collations, node->numCols);
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
WRITE_NODE_FIELD(part_prune_info);
}
static void
_outRecursiveUnion(StringInfo str, const RecursiveUnion *node)
{
WRITE_NODE_TYPE("RECURSIVEUNION");
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(wtParam);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(dupColIdx, node->numCols);
WRITE_OID_ARRAY(dupOperators, node->numCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(dupCollations, node->numCols);
WRITE_LONG_FIELD(numGroups);
}
static void
_outBitmapAnd(StringInfo str, const BitmapAnd *node)
{
WRITE_NODE_TYPE("BITMAPAND");
_outPlanInfo(str, (const Plan *) node);
WRITE_NODE_FIELD(bitmapplans);
}
static void
_outBitmapOr(StringInfo str, const BitmapOr *node)
{
WRITE_NODE_TYPE("BITMAPOR");
_outPlanInfo(str, (const Plan *) node);
WRITE_BOOL_FIELD(isshared);
WRITE_NODE_FIELD(bitmapplans);
}
static void
_outGather(StringInfo str, const Gather *node)
{
WRITE_NODE_TYPE("GATHER");
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(num_workers);
Force rescanning of parallel-aware scan nodes below a Gather[Merge]. The ExecReScan machinery contains various optimizations for postponing or skipping rescans of plan subtrees; for example a HashAgg node may conclude that it can re-use the table it built before, instead of re-reading its input subtree. But that is wrong if the input contains a parallel-aware table scan node, since the portion of the table scanned by the leader process is likely to vary from one rescan to the next. This explains the timing-dependent buildfarm failures we saw after commit a2b70c89c. The established mechanism for showing that a plan node's output is potentially variable is to mark it as depending on some runtime Param. Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC parameter number, but carries no actual value) associated with each Gather or GatherMerge node, mark parallel-aware nodes below that node as dependent on that Param, and arrange for ExecReScanGather[Merge] to flag that Param as changed whenever the Gather[Merge] node is rescanned. This solution breaks an undocumented assumption made by the parallel executor logic, namely that all rescans of nodes below a Gather[Merge] will happen synchronously during the ReScan of the top node itself. But that's fundamentally contrary to the design of the ExecReScan code, and so was doomed to fail someday anyway (even if you want to argue that the bug being fixed here wasn't a failure of that assumption). A follow-on patch will address that issue. In the meantime, the worst that's expected to happen is that given very bad timing luck, the leader might have to do all the work during a rescan, because workers think they have nothing to do, if they are able to start up before the eventual ReScan of the leader's parallel-aware table scan node has reset the shared scan state. Although this problem exists in 9.6, there does not seem to be any way for it to manifest there. Without GatherMerge, it seems that a plan tree that has a rescan-short-circuiting node below Gather will always also have one above it that will short-circuit in the same cases, preventing the Gather from being rescanned. Hence we won't take the risk of back-patching this change into 9.6. But v10 needs it. Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
WRITE_INT_FIELD(rescan_param);
WRITE_BOOL_FIELD(single_copy);
WRITE_BOOL_FIELD(invisible);
WRITE_BITMAPSET_FIELD(initParam);
}
static void
_outGatherMerge(StringInfo str, const GatherMerge *node)
{
WRITE_NODE_TYPE("GATHERMERGE");
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(num_workers);
Force rescanning of parallel-aware scan nodes below a Gather[Merge]. The ExecReScan machinery contains various optimizations for postponing or skipping rescans of plan subtrees; for example a HashAgg node may conclude that it can re-use the table it built before, instead of re-reading its input subtree. But that is wrong if the input contains a parallel-aware table scan node, since the portion of the table scanned by the leader process is likely to vary from one rescan to the next. This explains the timing-dependent buildfarm failures we saw after commit a2b70c89c. The established mechanism for showing that a plan node's output is potentially variable is to mark it as depending on some runtime Param. Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC parameter number, but carries no actual value) associated with each Gather or GatherMerge node, mark parallel-aware nodes below that node as dependent on that Param, and arrange for ExecReScanGather[Merge] to flag that Param as changed whenever the Gather[Merge] node is rescanned. This solution breaks an undocumented assumption made by the parallel executor logic, namely that all rescans of nodes below a Gather[Merge] will happen synchronously during the ReScan of the top node itself. But that's fundamentally contrary to the design of the ExecReScan code, and so was doomed to fail someday anyway (even if you want to argue that the bug being fixed here wasn't a failure of that assumption). A follow-on patch will address that issue. In the meantime, the worst that's expected to happen is that given very bad timing luck, the leader might have to do all the work during a rescan, because workers think they have nothing to do, if they are able to start up before the eventual ReScan of the leader's parallel-aware table scan node has reset the shared scan state. Although this problem exists in 9.6, there does not seem to be any way for it to manifest there. Without GatherMerge, it seems that a plan tree that has a rescan-short-circuiting node below Gather will always also have one above it that will short-circuit in the same cases, preventing the Gather from being rescanned. Hence we won't take the risk of back-patching this change into 9.6. But v10 needs it. Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
WRITE_INT_FIELD(rescan_param);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
WRITE_OID_ARRAY(sortOperators, node->numCols);
WRITE_OID_ARRAY(collations, node->numCols);
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
WRITE_BITMAPSET_FIELD(initParam);
}
static void
_outScan(StringInfo str, const Scan *node)
{
WRITE_NODE_TYPE("SCAN");
_outScanInfo(str, node);
}
static void
_outSeqScan(StringInfo str, const SeqScan *node)
{
WRITE_NODE_TYPE("SEQSCAN");
_outScanInfo(str, (const Scan *) node);
}
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
static void
_outSampleScan(StringInfo str, const SampleScan *node)
{
WRITE_NODE_TYPE("SAMPLESCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(tablesample);
}
static void
_outIndexScan(StringInfo str, const IndexScan *node)
{
WRITE_NODE_TYPE("INDEXSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_OID_FIELD(indexid);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
WRITE_NODE_FIELD(indexorderby);
WRITE_NODE_FIELD(indexorderbyorig);
WRITE_NODE_FIELD(indexorderbyops);
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
}
static void
_outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
{
WRITE_NODE_TYPE("INDEXONLYSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_OID_FIELD(indexid);
WRITE_NODE_FIELD(indexqual);
Fix index-only scan plans, take 2. Commit 4ace45677 failed to fix the problem fully, because the same issue of attempting to fetch a non-returnable index column can occur when rechecking the indexqual after using a lossy index operator. Moreover, it broke EXPLAIN for such indexquals (which indicates a gap in our test cases :-(). Revert the code changes of 4ace45677 in favor of adding a new field to struct IndexOnlyScan, containing a version of the indexqual that can be executed against the index-returned tuple without using any non-returnable columns. (The restrictions imposed by check_index_only guarantee this is possible, although we may have to recompute indexed expressions.) Support construction of that during setrefs.c processing by marking IndexOnlyScan.indextlist entries as resjunk if they can't be returned, rather than removing them entirely. (We could alternatively require setrefs.c to look up the IndexOptInfo again, but abusing resjunk this way seems like a reasonably safe way to avoid needing to do that.) This solution isn't great from an API-stability standpoint: if there are any extensions out there that build IndexOnlyScan structs directly, they'll be broken in the next minor releases. However, only a very invasive extension would be likely to do such a thing. There's no change in the Path representation, so typical planner extensions shouldn't have a problem. As before, back-patch to all supported branches. Discussion: https://postgr.es/m/3179992.1641150853@sss.pgh.pa.us Discussion: https://postgr.es/m/17350-b5bdcf476e5badbb@postgresql.org
2022-01-03 21:42:27 +01:00
WRITE_NODE_FIELD(recheckqual);
WRITE_NODE_FIELD(indexorderby);
WRITE_NODE_FIELD(indextlist);
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
}
static void
_outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node)
{
WRITE_NODE_TYPE("BITMAPINDEXSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_OID_FIELD(indexid);
WRITE_BOOL_FIELD(isshared);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
}
static void
_outBitmapHeapScan(StringInfo str, const BitmapHeapScan *node)
{
WRITE_NODE_TYPE("BITMAPHEAPSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(bitmapqualorig);
}
static void
_outTidScan(StringInfo str, const TidScan *node)
{
WRITE_NODE_TYPE("TIDSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(tidquals);
}
static void
_outTidRangeScan(StringInfo str, const TidRangeScan *node)
{
WRITE_NODE_TYPE("TIDRANGESCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(tidrangequals);
}
static void
_outSubqueryScan(StringInfo str, const SubqueryScan *node)
{
WRITE_NODE_TYPE("SUBQUERYSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(subplan);
WRITE_ENUM_FIELD(scanstatus, SubqueryScanStatus);
}
static void
_outFunctionScan(StringInfo str, const FunctionScan *node)
{
WRITE_NODE_TYPE("FUNCTIONSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(functions);
WRITE_BOOL_FIELD(funcordinality);
}
static void
_outTableFuncScan(StringInfo str, const TableFuncScan *node)
{
WRITE_NODE_TYPE("TABLEFUNCSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(tablefunc);
}
static void
_outValuesScan(StringInfo str, const ValuesScan *node)
{
WRITE_NODE_TYPE("VALUESSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_NODE_FIELD(values_lists);
}
static void
_outCteScan(StringInfo str, const CteScan *node)
{
WRITE_NODE_TYPE("CTESCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_INT_FIELD(ctePlanId);
WRITE_INT_FIELD(cteParam);
}
static void
_outNamedTuplestoreScan(StringInfo str, const NamedTuplestoreScan *node)
{
WRITE_NODE_TYPE("NAMEDTUPLESTORESCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_STRING_FIELD(enrname);
}
static void
_outWorkTableScan(StringInfo str, const WorkTableScan *node)
{
WRITE_NODE_TYPE("WORKTABLESCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_INT_FIELD(wtParam);
}
static void
_outForeignScan(StringInfo str, const ForeignScan *node)
{
WRITE_NODE_TYPE("FOREIGNSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_ENUM_FIELD(operation, CmdType);
WRITE_UINT_FIELD(resultRelation);
Code review for foreign/custom join pushdown patch. Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design decisions that seem pretty questionable to me, and there was quite a lot of stuff not to like about the documentation and comments. Clean up as follows: * Consider foreign joins only between foreign tables on the same server, rather than between any two foreign tables with the same underlying FDW handler function. In most if not all cases, the FDW would simply have had to apply the same-server restriction itself (far more expensively, both for lack of caching and because it would be repeated for each combination of input sub-joins), or else risk nasty bugs. Anyone who's really intent on doing something outside this restriction can always use the set_join_pathlist_hook. * Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist to better reflect what they're for, and allow these custom scan tlists to be used even for base relations. * Change make_foreignscan() API to include passing the fdw_scan_tlist value, since the FDW is required to set that. Backwards compatibility doesn't seem like an adequate reason to expect FDWs to set it in some ad-hoc extra step, and anyway existing FDWs can just pass NIL. * Change the API of path-generating subroutines of add_paths_to_joinrel, and in particular that of GetForeignJoinPaths and set_join_pathlist_hook, so that various less-used parameters are passed in a struct rather than as separate parameter-list entries. The objective here is to reduce the probability that future additions to those parameter lists will result in source-level API breaks for users of these hooks. It's possible that this is even a small win for the core code, since most CPU architectures can't pass more than half a dozen parameters efficiently anyway. I kept root, joinrel, outerrel, innerrel, and jointype as separate parameters to reduce code churn in joinpath.c --- in particular, putting jointype into the struct would have been problematic because of the subroutines' habit of changing their local copies of that variable. * Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all right for it to know about IndexOnlyScan, but if the list is to grow we should refactor the knowledge out to the callers. * Restore nodeForeignscan.c's previous use of the relcache to avoid extra GetFdwRoutine lookups for base-relation scans. * Lots of cleanup of documentation and missed comments. Re-order some code additions into more logical places.
2015-05-10 20:36:30 +02:00
WRITE_OID_FIELD(fs_server);
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
WRITE_NODE_FIELD(fdw_exprs);
WRITE_NODE_FIELD(fdw_private);
Code review for foreign/custom join pushdown patch. Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design decisions that seem pretty questionable to me, and there was quite a lot of stuff not to like about the documentation and comments. Clean up as follows: * Consider foreign joins only between foreign tables on the same server, rather than between any two foreign tables with the same underlying FDW handler function. In most if not all cases, the FDW would simply have had to apply the same-server restriction itself (far more expensively, both for lack of caching and because it would be repeated for each combination of input sub-joins), or else risk nasty bugs. Anyone who's really intent on doing something outside this restriction can always use the set_join_pathlist_hook. * Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist to better reflect what they're for, and allow these custom scan tlists to be used even for base relations. * Change make_foreignscan() API to include passing the fdw_scan_tlist value, since the FDW is required to set that. Backwards compatibility doesn't seem like an adequate reason to expect FDWs to set it in some ad-hoc extra step, and anyway existing FDWs can just pass NIL. * Change the API of path-generating subroutines of add_paths_to_joinrel, and in particular that of GetForeignJoinPaths and set_join_pathlist_hook, so that various less-used parameters are passed in a struct rather than as separate parameter-list entries. The objective here is to reduce the probability that future additions to those parameter lists will result in source-level API breaks for users of these hooks. It's possible that this is even a small win for the core code, since most CPU architectures can't pass more than half a dozen parameters efficiently anyway. I kept root, joinrel, outerrel, innerrel, and jointype as separate parameters to reduce code churn in joinpath.c --- in particular, putting jointype into the struct would have been problematic because of the subroutines' habit of changing their local copies of that variable. * Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all right for it to know about IndexOnlyScan, but if the list is to grow we should refactor the knowledge out to the callers. * Restore nodeForeignscan.c's previous use of the relcache to avoid extra GetFdwRoutine lookups for base-relation scans. * Lots of cleanup of documentation and missed comments. Re-order some code additions into more logical places.
2015-05-10 20:36:30 +02:00
WRITE_NODE_FIELD(fdw_scan_tlist);
WRITE_NODE_FIELD(fdw_recheck_quals);
Code review for foreign/custom join pushdown patch. Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design decisions that seem pretty questionable to me, and there was quite a lot of stuff not to like about the documentation and comments. Clean up as follows: * Consider foreign joins only between foreign tables on the same server, rather than between any two foreign tables with the same underlying FDW handler function. In most if not all cases, the FDW would simply have had to apply the same-server restriction itself (far more expensively, both for lack of caching and because it would be repeated for each combination of input sub-joins), or else risk nasty bugs. Anyone who's really intent on doing something outside this restriction can always use the set_join_pathlist_hook. * Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist to better reflect what they're for, and allow these custom scan tlists to be used even for base relations. * Change make_foreignscan() API to include passing the fdw_scan_tlist value, since the FDW is required to set that. Backwards compatibility doesn't seem like an adequate reason to expect FDWs to set it in some ad-hoc extra step, and anyway existing FDWs can just pass NIL. * Change the API of path-generating subroutines of add_paths_to_joinrel, and in particular that of GetForeignJoinPaths and set_join_pathlist_hook, so that various less-used parameters are passed in a struct rather than as separate parameter-list entries. The objective here is to reduce the probability that future additions to those parameter lists will result in source-level API breaks for users of these hooks. It's possible that this is even a small win for the core code, since most CPU architectures can't pass more than half a dozen parameters efficiently anyway. I kept root, joinrel, outerrel, innerrel, and jointype as separate parameters to reduce code churn in joinpath.c --- in particular, putting jointype into the struct would have been problematic because of the subroutines' habit of changing their local copies of that variable. * Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all right for it to know about IndexOnlyScan, but if the list is to grow we should refactor the knowledge out to the callers. * Restore nodeForeignscan.c's previous use of the relcache to avoid extra GetFdwRoutine lookups for base-relation scans. * Lots of cleanup of documentation and missed comments. Re-order some code additions into more logical places.
2015-05-10 20:36:30 +02:00
WRITE_BITMAPSET_FIELD(fs_relids);
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
WRITE_BOOL_FIELD(fsSystemCol);
}
static void
_outCustomScan(StringInfo str, const CustomScan *node)
{
WRITE_NODE_TYPE("CUSTOMSCAN");
_outScanInfo(str, (const Scan *) node);
WRITE_UINT_FIELD(flags);
WRITE_NODE_FIELD(custom_plans);
WRITE_NODE_FIELD(custom_exprs);
WRITE_NODE_FIELD(custom_private);
Code review for foreign/custom join pushdown patch. Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design decisions that seem pretty questionable to me, and there was quite a lot of stuff not to like about the documentation and comments. Clean up as follows: * Consider foreign joins only between foreign tables on the same server, rather than between any two foreign tables with the same underlying FDW handler function. In most if not all cases, the FDW would simply have had to apply the same-server restriction itself (far more expensively, both for lack of caching and because it would be repeated for each combination of input sub-joins), or else risk nasty bugs. Anyone who's really intent on doing something outside this restriction can always use the set_join_pathlist_hook. * Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist to better reflect what they're for, and allow these custom scan tlists to be used even for base relations. * Change make_foreignscan() API to include passing the fdw_scan_tlist value, since the FDW is required to set that. Backwards compatibility doesn't seem like an adequate reason to expect FDWs to set it in some ad-hoc extra step, and anyway existing FDWs can just pass NIL. * Change the API of path-generating subroutines of add_paths_to_joinrel, and in particular that of GetForeignJoinPaths and set_join_pathlist_hook, so that various less-used parameters are passed in a struct rather than as separate parameter-list entries. The objective here is to reduce the probability that future additions to those parameter lists will result in source-level API breaks for users of these hooks. It's possible that this is even a small win for the core code, since most CPU architectures can't pass more than half a dozen parameters efficiently anyway. I kept root, joinrel, outerrel, innerrel, and jointype as separate parameters to reduce code churn in joinpath.c --- in particular, putting jointype into the struct would have been problematic because of the subroutines' habit of changing their local copies of that variable. * Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all right for it to know about IndexOnlyScan, but if the list is to grow we should refactor the knowledge out to the callers. * Restore nodeForeignscan.c's previous use of the relcache to avoid extra GetFdwRoutine lookups for base-relation scans. * Lots of cleanup of documentation and missed comments. Re-order some code additions into more logical places.
2015-05-10 20:36:30 +02:00
WRITE_NODE_FIELD(custom_scan_tlist);
WRITE_BITMAPSET_FIELD(custom_relids);
/* CustomName is a key to lookup CustomScanMethods */
appendStringInfoString(str, " :methods ");
outToken(str, node->methods->CustomName);
}
static void
_outNestLoop(StringInfo str, const NestLoop *node)
{
WRITE_NODE_TYPE("NESTLOOP");
_outJoinPlanInfo(str, (const Join *) node);
WRITE_NODE_FIELD(nestParams);
}
static void
_outMergeJoin(StringInfo str, const MergeJoin *node)
{
int numCols;
WRITE_NODE_TYPE("MERGEJOIN");
_outJoinPlanInfo(str, (const Join *) node);
WRITE_BOOL_FIELD(skip_mark_restore);
WRITE_NODE_FIELD(mergeclauses);
numCols = list_length(node->mergeclauses);
WRITE_OID_ARRAY(mergeFamilies, numCols);
WRITE_OID_ARRAY(mergeCollations, numCols);
WRITE_INT_ARRAY(mergeStrategies, numCols);
WRITE_BOOL_ARRAY(mergeNullsFirst, numCols);
}
static void
_outHashJoin(StringInfo str, const HashJoin *node)
{
WRITE_NODE_TYPE("HASHJOIN");
_outJoinPlanInfo(str, (const Join *) node);
WRITE_NODE_FIELD(hashclauses);
Fix representation of hash keys in Hash/HashJoin nodes. In 5f32b29c1819 I changed the creation of HashState.hashkeys to actually use HashState as the parent (instead of HashJoinState, which was incorrect, as they were executed below HashState), to fix the problem of hashkeys expressions otherwise relying on slot types appropriate for HashJoinState, rather than HashState as would be correct. That reliance was only introduced in 12, which is why it previously worked to use HashJoinState as the parent (although I'd be unsurprised if there were problematic cases). Unfortunately that's not a sufficient solution, because before this commit, the to-be-hashed expressions referenced inner/outer as appropriate for the HashJoin, not Hash. That didn't have obvious bad consequences, because the slots containing the tuples were put into ecxt_innertuple when hashing a tuple for HashState (even though Hash doesn't have an inner plan). There are less common cases where this can cause visible problems however (rather than just confusion when inspecting such executor trees). E.g. "ERROR: bogus varno: 65000", when explaining queries containing a HashJoin where the subsidiary Hash node's hash keys reference a subplan. While normally hashkeys aren't displayed by EXPLAIN, if one of those expressions references a subplan, that subplan may be printed as part of the Hash node - which then failed because an inner plan was referenced, and Hash doesn't have that. It seems quite possible that there's other broken cases, too. Fix the problem by properly splitting the expression for the HashJoin and Hash nodes at plan time, and have them reference the proper subsidiary node. While other workarounds are possible, fixing this correctly seems easy enough. It was a pretty ugly hack to have ExecInitHashJoin put the expression into the already initialized HashState, in the first place. I decided to not just split inner/outer hashkeys inside make_hashjoin(), but also to separate out hashoperators and hashcollations at plan time. Otherwise we would have ended up having two very similar loops, one at plan time and the other during executor startup. The work seems to more appropriately belong to plan time, anyway. Reported-By: Nikita Glukhov, Alexander Korotkov Author: Andres Freund Reviewed-By: Tom Lane, in an earlier version Discussion: https://postgr.es/m/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR+teQ_8tEXU8mxg@mail.gmail.com Backpatch: 12-
2019-08-02 09:02:46 +02:00
WRITE_NODE_FIELD(hashoperators);
WRITE_NODE_FIELD(hashcollations);
WRITE_NODE_FIELD(hashkeys);
}
static void
_outAgg(StringInfo str, const Agg *node)
{
WRITE_NODE_TYPE("AGG");
_outPlanInfo(str, (const Plan *) node);
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
WRITE_ENUM_FIELD(aggsplit, AggSplit);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numCols);
WRITE_OID_ARRAY(grpOperators, node->numCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(grpCollations, node->numCols);
WRITE_LONG_FIELD(numGroups);
WRITE_UINT64_FIELD(transitionSpace);
WRITE_BITMAPSET_FIELD(aggParams);
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
WRITE_NODE_FIELD(groupingSets);
WRITE_NODE_FIELD(chain);
}
static void
_outWindowAgg(StringInfo str, const WindowAgg *node)
{
WRITE_NODE_TYPE("WINDOWAGG");
_outPlanInfo(str, (const Plan *) node);
WRITE_UINT_FIELD(winref);
WRITE_INT_FIELD(partNumCols);
WRITE_ATTRNUMBER_ARRAY(partColIdx, node->partNumCols);
WRITE_OID_ARRAY(partOperators, node->partNumCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(partCollations, node->partNumCols);
WRITE_INT_FIELD(ordNumCols);
WRITE_ATTRNUMBER_ARRAY(ordColIdx, node->ordNumCols);
WRITE_OID_ARRAY(ordOperators, node->ordNumCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(ordCollations, node->ordNumCols);
WRITE_INT_FIELD(frameOptions);
WRITE_NODE_FIELD(startOffset);
WRITE_NODE_FIELD(endOffset);
Teach planner and executor about monotonic window funcs Window functions such as row_number() always return a value higher than the previously returned value for tuples in any given window partition. Traditionally queries such as; SELECT * FROM ( SELECT *, row_number() over (order by c) rn FROM t ) t WHERE rn <= 10; were executed fairly inefficiently. Neither the query planner nor the executor knew that once rn made it to 11 that nothing further would match the outer query's WHERE clause. It would blindly continue until all tuples were exhausted from the subquery. Here we implement means to make the above execute more efficiently. This is done by way of adding a pg_proc.prosupport function to various of the built-in window functions and adding supporting code to allow the support function to inform the planner if the window function is monotonically increasing, monotonically decreasing, both or neither. The planner is then able to make use of that information and possibly allow the executor to short-circuit execution by way of adding a "run condition" to the WindowAgg to allow it to determine if some of its execution work can be skipped. This "run condition" is not like a normal filter. These run conditions are only built using quals comparing values to monotonic window functions. For monotonic increasing functions, quals making use of the btree operators for <, <= and = can be used (assuming the window function column is on the left). You can see here that once such a condition becomes false that a monotonic increasing function could never make it subsequently true again. For monotonically decreasing functions the >, >= and = btree operators for the given type can be used for run conditions. The best-case situation for this is when there is a single WindowAgg node without a PARTITION BY clause. Here when the run condition becomes false the WindowAgg node can simply return NULL. No more tuples will ever match the run condition. It's a little more complex when there is a PARTITION BY clause. In this case, we cannot return NULL as we must still process other partitions. To speed this case up we pull tuples from the outer plan to check if they're from the same partition and simply discard them if they are. When we find a tuple belonging to another partition we start processing as normal again until the run condition becomes false or we run out of tuples to process. When there are multiple WindowAgg nodes to evaluate then this complicates the situation. For intermediate WindowAggs we must ensure we always return all tuples to the calling node. Any filtering done could lead to incorrect results in WindowAgg nodes above. For all intermediate nodes, we can still save some work when the run condition becomes false. We've no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot reference the value of these and these tuples will not appear in the final result anyway. The savings here are small in comparison to what can be saved in the top-level WingowAgg, but still worthwhile. Intermediate WindowAgg nodes never filter out tuples, but here we change WindowAgg so that the top-level WindowAgg filters out tuples that don't match the intermediate WindowAgg node's run condition. Such filters appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node. Here we add prosupport functions to allow the above to work for; row_number(), rank(), dense_rank(), count(*) and count(expr). It appears technically possible to do the same for min() and max(), however, it seems unlikely to be useful enough, so that's not done here. Bump catversion Author: David Rowley Reviewed-by: Andy Fan, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
WRITE_NODE_FIELD(runCondition);
WRITE_NODE_FIELD(runConditionOrig);
Support all SQL:2011 options for window frame clauses. This patch adds the ability to use "RANGE offset PRECEDING/FOLLOWING" frame boundaries in window functions. We'd punted on that back in the original patch to add window functions, because it was not clear how to do it in a reasonably data-type-extensible fashion. That problem is resolved here by adding the ability for btree operator classes to provide an "in_range" support function that defines how to add or subtract the RANGE offset value. Factoring it this way also allows the operator class to avoid overflow problems near the ends of the datatype's range, if it wishes to expend effort on that. (In the committed patch, the integer opclasses handle that issue, but it did not seem worth the trouble to avoid overflow failures for datetime types.) The patch includes in_range support for the integer_ops opfamily (int2/int4/int8) as well as the standard datetime types. Support for other numeric types has been requested, but that seems like suitable material for a follow-on patch. In addition, the patch adds GROUPS mode which counts the offset in ORDER-BY peer groups rather than rows, and it adds the frame_exclusion options specified by SQL:2011. As far as I can see, we are now fully up to spec on window framing options. Existing behaviors remain unchanged, except that I changed the errcode for a couple of existing error reports to meet the SQL spec's expectation that negative "offset" values should be reported as SQLSTATE 22013. Internally and in relevant parts of the documentation, we now consistently use the terminology "offset PRECEDING/FOLLOWING" rather than "value PRECEDING/FOLLOWING", since the term "value" is confusingly vague. Oliver Ford, reviewed and whacked around some by me Discussion: https://postgr.es/m/CAGMVOdu9sivPAxbNN0X+q19Sfv9edEPv=HibOJhB14TJv_RCQg@mail.gmail.com
2018-02-07 06:06:50 +01:00
WRITE_OID_FIELD(startInRangeFunc);
WRITE_OID_FIELD(endInRangeFunc);
WRITE_OID_FIELD(inRangeColl);
WRITE_BOOL_FIELD(inRangeAsc);
WRITE_BOOL_FIELD(inRangeNullsFirst);
Teach planner and executor about monotonic window funcs Window functions such as row_number() always return a value higher than the previously returned value for tuples in any given window partition. Traditionally queries such as; SELECT * FROM ( SELECT *, row_number() over (order by c) rn FROM t ) t WHERE rn <= 10; were executed fairly inefficiently. Neither the query planner nor the executor knew that once rn made it to 11 that nothing further would match the outer query's WHERE clause. It would blindly continue until all tuples were exhausted from the subquery. Here we implement means to make the above execute more efficiently. This is done by way of adding a pg_proc.prosupport function to various of the built-in window functions and adding supporting code to allow the support function to inform the planner if the window function is monotonically increasing, monotonically decreasing, both or neither. The planner is then able to make use of that information and possibly allow the executor to short-circuit execution by way of adding a "run condition" to the WindowAgg to allow it to determine if some of its execution work can be skipped. This "run condition" is not like a normal filter. These run conditions are only built using quals comparing values to monotonic window functions. For monotonic increasing functions, quals making use of the btree operators for <, <= and = can be used (assuming the window function column is on the left). You can see here that once such a condition becomes false that a monotonic increasing function could never make it subsequently true again. For monotonically decreasing functions the >, >= and = btree operators for the given type can be used for run conditions. The best-case situation for this is when there is a single WindowAgg node without a PARTITION BY clause. Here when the run condition becomes false the WindowAgg node can simply return NULL. No more tuples will ever match the run condition. It's a little more complex when there is a PARTITION BY clause. In this case, we cannot return NULL as we must still process other partitions. To speed this case up we pull tuples from the outer plan to check if they're from the same partition and simply discard them if they are. When we find a tuple belonging to another partition we start processing as normal again until the run condition becomes false or we run out of tuples to process. When there are multiple WindowAgg nodes to evaluate then this complicates the situation. For intermediate WindowAggs we must ensure we always return all tuples to the calling node. Any filtering done could lead to incorrect results in WindowAgg nodes above. For all intermediate nodes, we can still save some work when the run condition becomes false. We've no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot reference the value of these and these tuples will not appear in the final result anyway. The savings here are small in comparison to what can be saved in the top-level WingowAgg, but still worthwhile. Intermediate WindowAgg nodes never filter out tuples, but here we change WindowAgg so that the top-level WindowAgg filters out tuples that don't match the intermediate WindowAgg node's run condition. Such filters appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node. Here we add prosupport functions to allow the above to work for; row_number(), rank(), dense_rank(), count(*) and count(expr). It appears technically possible to do the same for min() and max(), however, it seems unlikely to be useful enough, so that's not done here. Bump catversion Author: David Rowley Reviewed-by: Andy Fan, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
WRITE_BOOL_FIELD(topWindow);
}
static void
_outGroup(StringInfo str, const Group *node)
{
WRITE_NODE_TYPE("GROUP");
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numCols);
WRITE_OID_ARRAY(grpOperators, node->numCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(grpCollations, node->numCols);
}
static void
_outMaterial(StringInfo str, const Material *node)
{
WRITE_NODE_TYPE("MATERIAL");
_outPlanInfo(str, (const Plan *) node);
}
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
static void
_outMemoize(StringInfo str, const Memoize *node)
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
{
WRITE_NODE_TYPE("MEMOIZE");
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(numKeys);
WRITE_OID_ARRAY(hashOperators, node->numKeys);
WRITE_OID_ARRAY(collations, node->numKeys);
WRITE_NODE_FIELD(param_exprs);
WRITE_BOOL_FIELD(singlerow);
WRITE_BOOL_FIELD(binary_mode);
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
WRITE_UINT_FIELD(est_entries);
WRITE_BITMAPSET_FIELD(keyparamids);
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
}
static void
Implement Incremental Sort Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
_outSortInfo(StringInfo str, const Sort *node)
{
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
WRITE_OID_ARRAY(sortOperators, node->numCols);
WRITE_OID_ARRAY(collations, node->numCols);
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
}
Implement Incremental Sort Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
static void
_outSort(StringInfo str, const Sort *node)
{
WRITE_NODE_TYPE("SORT");
_outSortInfo(str, node);
}
static void
_outIncrementalSort(StringInfo str, const IncrementalSort *node)
{
WRITE_NODE_TYPE("INCREMENTALSORT");
_outSortInfo(str, (const Sort *) node);
WRITE_INT_FIELD(nPresortedCols);
}
static void
_outUnique(StringInfo str, const Unique *node)
{
WRITE_NODE_TYPE("UNIQUE");
_outPlanInfo(str, (const Plan *) node);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(uniqColIdx, node->numCols);
WRITE_OID_ARRAY(uniqOperators, node->numCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(uniqCollations, node->numCols);
}
static void
_outHash(StringInfo str, const Hash *node)
{
WRITE_NODE_TYPE("HASH");
_outPlanInfo(str, (const Plan *) node);
Fix representation of hash keys in Hash/HashJoin nodes. In 5f32b29c1819 I changed the creation of HashState.hashkeys to actually use HashState as the parent (instead of HashJoinState, which was incorrect, as they were executed below HashState), to fix the problem of hashkeys expressions otherwise relying on slot types appropriate for HashJoinState, rather than HashState as would be correct. That reliance was only introduced in 12, which is why it previously worked to use HashJoinState as the parent (although I'd be unsurprised if there were problematic cases). Unfortunately that's not a sufficient solution, because before this commit, the to-be-hashed expressions referenced inner/outer as appropriate for the HashJoin, not Hash. That didn't have obvious bad consequences, because the slots containing the tuples were put into ecxt_innertuple when hashing a tuple for HashState (even though Hash doesn't have an inner plan). There are less common cases where this can cause visible problems however (rather than just confusion when inspecting such executor trees). E.g. "ERROR: bogus varno: 65000", when explaining queries containing a HashJoin where the subsidiary Hash node's hash keys reference a subplan. While normally hashkeys aren't displayed by EXPLAIN, if one of those expressions references a subplan, that subplan may be printed as part of the Hash node - which then failed because an inner plan was referenced, and Hash doesn't have that. It seems quite possible that there's other broken cases, too. Fix the problem by properly splitting the expression for the HashJoin and Hash nodes at plan time, and have them reference the proper subsidiary node. While other workarounds are possible, fixing this correctly seems easy enough. It was a pretty ugly hack to have ExecInitHashJoin put the expression into the already initialized HashState, in the first place. I decided to not just split inner/outer hashkeys inside make_hashjoin(), but also to separate out hashoperators and hashcollations at plan time. Otherwise we would have ended up having two very similar loops, one at plan time and the other during executor startup. The work seems to more appropriately belong to plan time, anyway. Reported-By: Nikita Glukhov, Alexander Korotkov Author: Andres Freund Reviewed-By: Tom Lane, in an earlier version Discussion: https://postgr.es/m/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR+teQ_8tEXU8mxg@mail.gmail.com Backpatch: 12-
2019-08-02 09:02:46 +02:00
WRITE_NODE_FIELD(hashkeys);
WRITE_OID_FIELD(skewTable);
WRITE_INT_FIELD(skewColumn);
WRITE_BOOL_FIELD(skewInherit);
Add parallel-aware hash joins. Introduce parallel-aware hash joins that appear in EXPLAIN plans as Parallel Hash Join with Parallel Hash. While hash joins could already appear in parallel queries, they were previously always parallel-oblivious and had a partial subplan only on the outer side, meaning that the work of the inner subplan was duplicated in every worker. After this commit, the planner will consider using a partial subplan on the inner side too, using the Parallel Hash node to divide the work over the available CPU cores and combine its results in shared memory. If the join needs to be split into multiple batches in order to respect work_mem, then workers process different batches as much as possible and then work together on the remaining batches. The advantages of a parallel-aware hash join over a parallel-oblivious hash join used in a parallel query are that it: * avoids wasting memory on duplicated hash tables * avoids wasting disk space on duplicated batch files * divides the work of building the hash table over the CPUs One disadvantage is that there is some communication between the participating CPUs which might outweigh the benefits of parallelism in the case of small hash tables. This is avoided by the planner's existing reluctance to supply partial plans for small scans, but it may be necessary to estimate synchronization costs in future if that situation changes. Another is that outer batch 0 must be written to disk if multiple batches are required. A potential future advantage of parallel-aware hash joins is that right and full outer joins could be supported, since there is a single set of matched bits for each hashtable, but that is not yet implemented. A new GUC enable_parallel_hash is defined to control the feature, defaulting to on. Author: Thomas Munro Reviewed-By: Andres Freund, Robert Haas Tested-By: Rafia Sabih, Prabhat Sahu Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com https://postgr.es/m/CAEepm=37HKyJ4U6XOLi=JgfSHM3o6B-GaeO-6hkOmneTDkH+Uw@mail.gmail.com
2017-12-21 08:39:21 +01:00
WRITE_FLOAT_FIELD(rows_total, "%.0f");
}
static void
_outSetOp(StringInfo str, const SetOp *node)
{
WRITE_NODE_TYPE("SETOP");
_outPlanInfo(str, (const Plan *) node);
WRITE_ENUM_FIELD(cmd, SetOpCmd);
WRITE_ENUM_FIELD(strategy, SetOpStrategy);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(dupColIdx, node->numCols);
WRITE_OID_ARRAY(dupOperators, node->numCols);
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:09:32 +01:00
WRITE_OID_ARRAY(dupCollations, node->numCols);
WRITE_INT_FIELD(flagColIdx);
WRITE_INT_FIELD(firstFlag);
WRITE_LONG_FIELD(numGroups);
}
static void
_outLockRows(StringInfo str, const LockRows *node)
{
WRITE_NODE_TYPE("LOCKROWS");
_outPlanInfo(str, (const Plan *) node);
WRITE_NODE_FIELD(rowMarks);
WRITE_INT_FIELD(epqParam);
}
static void
_outLimit(StringInfo str, const Limit *node)
{
WRITE_NODE_TYPE("LIMIT");
_outPlanInfo(str, (const Plan *) node);
WRITE_NODE_FIELD(limitOffset);
WRITE_NODE_FIELD(limitCount);
WRITE_ENUM_FIELD(limitOption, LimitOption);
WRITE_INT_FIELD(uniqNumCols);
WRITE_ATTRNUMBER_ARRAY(uniqColIdx, node->uniqNumCols);
WRITE_OID_ARRAY(uniqOperators, node->uniqNumCols);
WRITE_OID_ARRAY(uniqCollations, node->uniqNumCols);
}
static void
_outNestLoopParam(StringInfo str, const NestLoopParam *node)
{
WRITE_NODE_TYPE("NESTLOOPPARAM");
WRITE_INT_FIELD(paramno);
WRITE_NODE_FIELD(paramval);
}
static void
_outPlanRowMark(StringInfo str, const PlanRowMark *node)
{
WRITE_NODE_TYPE("PLANROWMARK");
WRITE_UINT_FIELD(rti);
WRITE_UINT_FIELD(prti);
WRITE_UINT_FIELD(rowmarkId);
WRITE_ENUM_FIELD(markType, RowMarkType);
2015-03-15 23:41:47 +01:00
WRITE_INT_FIELD(allMarkTypes);
WRITE_ENUM_FIELD(strength, LockClauseStrength);
WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
WRITE_BOOL_FIELD(isParent);
}
static void
_outPartitionPruneInfo(StringInfo str, const PartitionPruneInfo *node)
{
WRITE_NODE_TYPE("PARTITIONPRUNEINFO");
WRITE_NODE_FIELD(prune_infos);
WRITE_BITMAPSET_FIELD(other_subplans);
}
static void
_outPartitionedRelPruneInfo(StringInfo str, const PartitionedRelPruneInfo *node)
{
WRITE_NODE_TYPE("PARTITIONEDRELPRUNEINFO");
WRITE_UINT_FIELD(rtindex);
WRITE_BITMAPSET_FIELD(present_parts);
WRITE_INT_FIELD(nparts);
WRITE_INT_ARRAY(subplan_map, node->nparts);
WRITE_INT_ARRAY(subpart_map, node->nparts);
Allow ATTACH PARTITION with only ShareUpdateExclusiveLock. We still require AccessExclusiveLock on the partition itself, because otherwise an insert that violates the newly-imposed partition constraint could be in progress at the same time that we're changing that constraint; only the lock level on the parent relation is weakened. To make this safe, we have to cope with (at least) three separate problems. First, relevant DDL might commit while we're in the process of building a PartitionDesc. If so, find_inheritance_children() might see a new partition while the RELOID system cache still has the old partition bound cached, and even before invalidation messages have been queued. To fix that, if we see that the pg_class tuple seems to be missing or to have a null relpartbound, refetch the value directly from the table. We can't get the wrong value, because DETACH PARTITION still requires AccessExclusiveLock throughout; if we ever want to change that, this will need more thought. In testing, I found it quite difficult to hit even the null-relpartbound case; the race condition is extremely tight, but the theoretical risk is there. Second, successive calls to RelationGetPartitionDesc might not return the same answer. The query planner will get confused if lookup up the PartitionDesc for a particular relation does not return a consistent answer for the entire duration of query planning. Likewise, query execution will get confused if the same relation seems to have a different PartitionDesc at different times. Invent a new PartitionDirectory concept and use it to ensure consistency. This ensures that a single invocation of either the planner or the executor sees the same view of the PartitionDesc from beginning to end, but it does not guarantee that the planner and the executor see the same view. Since this allows pointers to old PartitionDesc entries to survive even after a relcache rebuild, also postpone removing the old PartitionDesc entry until we're certain no one is using it. For the most part, it seems to be OK for the planner and executor to have different views of the PartitionDesc, because the executor will just ignore any concurrently added partitions which were unknown at plan time; those partitions won't be part of the inheritance expansion, but invalidation messages will trigger replanning at some point. Normally, this happens by the time the very next command is executed, but if the next command acquires no locks and executes a prepared query, it can manage not to notice until a new transaction is started. We might want to tighten that up, but it's material for a separate patch. There would still be a small window where a query that started just after an ATTACH PARTITION command committed might fail to notice its results -- but only if the command starts before the commit has been acknowledged to the user. All in all, the warts here around serializability seem small enough to be worth accepting for the considerable advantage of being able to add partitions without a full table lock. Although in general the consequences of new partitions showing up between planning and execution are limited to the query not noticing the new partitions, run-time partition pruning will get confused in that case, so that's the third problem that this patch fixes. Run-time partition pruning assumes that indexes into the PartitionDesc are stable between planning and execution. So, add code so that if new partitions are added between plan time and execution time, the indexes stored in the subplan_map[] and subpart_map[] arrays within the plan's PartitionedRelPruneInfo get adjusted accordingly. There does not seem to be a simple way to generalize this scheme to cope with partitions that are removed, mostly because they could then get added back again with different bounds, but it works OK for added partitions. This code does not try to ensure that every backend participating in a parallel query sees the same view of the PartitionDesc. That currently doesn't matter, because we never pass PartitionDesc indexes between backends. Each backend will ignore the concurrently added partitions which it notices, and it doesn't matter if different backends are ignoring different sets of concurrently added partitions. If in the future that matters, for example because we allow writes in parallel query and want all participants to do tuple routing to the same set of partitions, the PartitionDirectory concept could be improved to share PartitionDescs across backends. There is a draft patch to serialize and restore PartitionDescs on the thread where this patch was discussed, which may be a useful place to start. Patch by me. Thanks to Alvaro Herrera, David Rowley, Simon Riggs, Amit Langote, and Michael Paquier for discussion, and to Alvaro Herrera for some review. Discussion: http://postgr.es/m/CA+Tgmobt2upbSocvvDej3yzokd7AkiT+PvgFH+a9-5VV1oJNSQ@mail.gmail.com Discussion: http://postgr.es/m/CA+TgmoZE0r9-cyA-aY6f8WFEROaDLLL7Vf81kZ8MtFCkxpeQSw@mail.gmail.com Discussion: http://postgr.es/m/CA+TgmoY13KQZF-=HNTrt9UYWYx3_oYOQpu9ioNT49jGgiDpUEA@mail.gmail.com
2019-03-07 17:13:12 +01:00
WRITE_OID_ARRAY(relid_map, node->nparts);
Restructure creation of run-time pruning steps. Previously, gen_partprune_steps() always built executor pruning steps using all suitable clauses, including those containing PARAM_EXEC Params. This meant that the pruning steps were only completely safe for executor run-time (scan start) pruning. To prune at executor startup, we had to ignore the steps involving exec Params. But this doesn't really work in general, since there may be logic changes needed as well --- for example, pruning according to the last operator's btree strategy is the wrong thing if we're not applying that operator. The rules embodied in gen_partprune_steps() and its minions are sufficiently complicated that tracking their incremental effects in other logic seems quite impractical. Short of a complete redesign, the only safe fix seems to be to run gen_partprune_steps() twice, once to create executor startup pruning steps and then again for run-time pruning steps. We can save a few cycles however by noting during the first scan whether we rejected any clauses because they involved exec Params --- if not, we don't need to do the second scan. In support of this, refactor the internal APIs in partprune.c to make more use of passing information in the GeneratePruningStepsContext struct, rather than as separate arguments. This is, I hope, the last piece of our response to a bug report from Alan Jackson. Back-patch to v11 where this code came in. Discussion: https://postgr.es/m/FAD28A83-AC73-489E-A058-2681FA31D648@tvsquared.com
2019-05-18 01:44:19 +02:00
WRITE_NODE_FIELD(initial_pruning_steps);
WRITE_NODE_FIELD(exec_pruning_steps);
WRITE_BITMAPSET_FIELD(execparamids);
}
static void
_outPartitionPruneStepOp(StringInfo str, const PartitionPruneStepOp *node)
{
WRITE_NODE_TYPE("PARTITIONPRUNESTEPOP");
WRITE_INT_FIELD(step.step_id);
WRITE_INT_FIELD(opstrategy);
WRITE_NODE_FIELD(exprs);
WRITE_NODE_FIELD(cmpfns);
WRITE_BITMAPSET_FIELD(nullkeys);
}
static void
_outPartitionPruneStepCombine(StringInfo str, const PartitionPruneStepCombine *node)
{
WRITE_NODE_TYPE("PARTITIONPRUNESTEPCOMBINE");
WRITE_INT_FIELD(step.step_id);
WRITE_ENUM_FIELD(combineOp, PartitionPruneCombineOp);
WRITE_NODE_FIELD(source_stepids);
}
static void
_outPlanInvalItem(StringInfo str, const PlanInvalItem *node)
{
WRITE_NODE_TYPE("PLANINVALITEM");
WRITE_INT_FIELD(cacheId);
WRITE_UINT_FIELD(hashValue);
}
/*****************************************************************************
*
* Stuff from primnodes.h.
*
*****************************************************************************/
static void
_outAlias(StringInfo str, const Alias *node)
{
WRITE_NODE_TYPE("ALIAS");
WRITE_STRING_FIELD(aliasname);
WRITE_NODE_FIELD(colnames);
}
static void
_outRangeVar(StringInfo str, const RangeVar *node)
{
WRITE_NODE_TYPE("RANGEVAR");
/*
* we deliberately ignore catalogname here, since it is presently not
* semantically meaningful
*/
WRITE_STRING_FIELD(schemaname);
WRITE_STRING_FIELD(relname);
WRITE_BOOL_FIELD(inh);
WRITE_CHAR_FIELD(relpersistence);
WRITE_NODE_FIELD(alias);
WRITE_LOCATION_FIELD(location);
}
static void
_outTableFunc(StringInfo str, const TableFunc *node)
{
WRITE_NODE_TYPE("TABLEFUNC");
WRITE_ENUM_FIELD(functype, TableFuncType);
WRITE_NODE_FIELD(ns_uris);
WRITE_NODE_FIELD(ns_names);
WRITE_NODE_FIELD(docexpr);
WRITE_NODE_FIELD(rowexpr);
WRITE_NODE_FIELD(colnames);
WRITE_NODE_FIELD(coltypes);
WRITE_NODE_FIELD(coltypmods);
WRITE_NODE_FIELD(colcollations);
WRITE_NODE_FIELD(colexprs);
WRITE_NODE_FIELD(coldefexprs);
WRITE_NODE_FIELD(colvalexprs);
WRITE_BITMAPSET_FIELD(notnulls);
WRITE_NODE_FIELD(plan);
WRITE_INT_FIELD(ordinalitycol);
WRITE_LOCATION_FIELD(location);
}
static void
_outIntoClause(StringInfo str, const IntoClause *node)
{
WRITE_NODE_TYPE("INTOCLAUSE");
WRITE_NODE_FIELD(rel);
WRITE_NODE_FIELD(colNames);
WRITE_STRING_FIELD(accessMethod);
WRITE_NODE_FIELD(options);
WRITE_ENUM_FIELD(onCommit, OnCommitAction);
WRITE_STRING_FIELD(tableSpaceName);
WRITE_NODE_FIELD(viewQuery);
WRITE_BOOL_FIELD(skipData);
}
static void
_outVar(StringInfo str, const Var *node)
{
WRITE_NODE_TYPE("VAR");
Remove arbitrary 64K-or-so limit on rangetable size. Up to now the size of a query's rangetable has been limited by the constants INNER_VAR et al, which mustn't be equal to any real rangetable index. 65000 doubtless seemed like enough for anybody, and it still is orders of magnitude larger than the number of joins we can realistically handle. However, we need a rangetable entry for each child partition that is (or might be) processed by a query. Queries with a few thousand partitions are getting more realistic, so that the day when that limit becomes a problem is in sight, even if it's not here yet. Hence, let's raise the limit. Rather than just increase the values of INNER_VAR et al, this patch adopts the approach of making them small negative values, so that rangetables could theoretically become as long as INT_MAX. The bulk of the patch is concerned with changing Var.varno and some related variables from "Index" (unsigned int) to plain "int". This is basically cosmetic, with little actual effect other than to help debuggers print their values nicely. As such, I've only bothered with changing places that could actually see INNER_VAR et al, which the parser and most of the planner don't. We do have to be careful in places that are performing less/greater comparisons on varnos, but there are very few such places, other than the IS_SPECIAL_VARNO macro itself. A notable side effect of this patch is that while it used to be possible to add INNER_VAR et al to a Bitmapset, that will now draw an error. I don't see any likelihood that it wouldn't be a bug to include these fake varnos in a bitmapset of real varnos, so I think this is all to the good. Although this touches outfuncs/readfuncs, I don't think a catversion bump is required, since stored rules would never contain Vars with these fake varnos. Andrey Lepikhov and Tom Lane, after a suggestion by Peter Eisentraut Discussion: https://postgr.es/m/43c7f2f5-1e27-27aa-8c65-c91859d15190@postgrespro.ru
2021-09-15 20:11:21 +02:00
WRITE_INT_FIELD(varno);
WRITE_INT_FIELD(varattno);
WRITE_OID_FIELD(vartype);
WRITE_INT_FIELD(vartypmod);
WRITE_OID_FIELD(varcollid);
WRITE_UINT_FIELD(varlevelsup);
Reconsider the representation of join alias Vars. The core idea of this patch is to make the parser generate join alias Vars (that is, ones with varno pointing to a JOIN RTE) only when the alias Var is actually different from any raw join input, that is a type coercion and/or COALESCE is necessary to generate the join output value. Otherwise just generate varno/varattno pointing to the relevant join input column. In effect, this means that the planner's flatten_join_alias_vars() transformation is already done in the parser, for all cases except (a) columns that are merged by JOIN USING and are transformed in the process, and (b) whole-row join Vars. In principle that would allow us to skip doing flatten_join_alias_vars() in many more queries than we do now, but we don't have quite enough infrastructure to know that we can do so --- in particular there's no cheap way to know whether there are any whole-row join Vars. I'm not sure if it's worth the trouble to add a Query-level flag for that, and in any case it seems like fit material for a separate patch. But even without skipping the work entirely, this should make flatten_join_alias_vars() faster, particularly where there are nested joins that it previously had to flatten recursively. An essential part of this change is to replace Var nodes' varnoold/varoattno fields with varnosyn/varattnosyn, which have considerably more tightly-defined meanings than the old fields: when they differ from varno/varattno, they identify the Var's position in an aliased JOIN RTE, and the join alias is what ruleutils.c should print for the Var. This is necessary because the varno change destroyed ruleutils.c's ability to find the JOIN RTE from the Var's varno. Another way in which this change broke ruleutils.c is that it's no longer feasible to determine, from a JOIN RTE's joinaliasvars list, which join columns correspond to which columns of the join's immediate input relations. (If those are sub-joins, the joinaliasvars entries may point to columns of their base relations, not the sub-joins.) But that was a horrid mess requiring a lot of fragile assumptions already, so let's just bite the bullet and add some more JOIN RTE fields to make it more straightforward to figure that out. I added two integer-List fields containing the relevant column numbers from the left and right input rels, plus a count of how many merged columns there are. This patch depends on the ParseNamespaceColumn infrastructure that I added in commit 5815696bc. The biggest bit of code change is restructuring transformFromClauseItem's handling of JOINs so that the ParseNamespaceColumn data is propagated upward correctly. Other than that and the ruleutils fixes, everything pretty much just works, though some processing is now inessential. I grabbed two pieces of low-hanging fruit in that line: 1. In find_expr_references, we don't need to recurse into join alias Vars anymore. There aren't any except for references to merged USING columns, which are more properly handled when we scan the join's RTE. This change actually fixes an edge-case issue: we will now record a dependency on any type-coercion function present in a USING column's joinaliasvar, even if that join column has no references in the query text. The odds of the missing dependency causing a problem seem quite small: you'd have to posit somebody dropping an implicit cast between two data types, without removing the types themselves, and then having a stored rule containing a whole-row Var for a join whose USING merge depends on that cast. So I don't feel a great need to change this in the back branches. But in theory this way is more correct. 2. markRTEForSelectPriv and markTargetListOrigin don't need to recurse into join alias Vars either, because the cases they care about don't apply to alias Vars for USING columns that are semantically distinct from the underlying columns. This removes the only case in which markVarForSelectPriv could be called with NULL for the RTE, so adjust the comments to describe that hack as being strictly internal to markRTEForSelectPriv. catversion bump required due to changes in stored rules. Discussion: https://postgr.es/m/7115.1577986646@sss.pgh.pa.us
2020-01-09 17:56:59 +01:00
WRITE_UINT_FIELD(varnosyn);
WRITE_INT_FIELD(varattnosyn);
WRITE_LOCATION_FIELD(location);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
static void
_outConst(StringInfo str, const Const *node)
{
WRITE_NODE_TYPE("CONST");
WRITE_OID_FIELD(consttype);
WRITE_INT_FIELD(consttypmod);
WRITE_OID_FIELD(constcollid);
WRITE_INT_FIELD(constlen);
WRITE_BOOL_FIELD(constbyval);
WRITE_BOOL_FIELD(constisnull);
WRITE_LOCATION_FIELD(location);
appendStringInfoString(str, " :constvalue ");
if (node->constisnull)
appendStringInfoString(str, "<>");
else
outDatum(str, node->constvalue, node->constlen, node->constbyval);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outParam(StringInfo str, const Param *node)
{
WRITE_NODE_TYPE("PARAM");
WRITE_ENUM_FIELD(paramkind, ParamKind);
WRITE_INT_FIELD(paramid);
WRITE_OID_FIELD(paramtype);
WRITE_INT_FIELD(paramtypmod);
WRITE_OID_FIELD(paramcollid);
WRITE_LOCATION_FIELD(location);
}
static void
_outAggref(StringInfo str, const Aggref *node)
{
WRITE_NODE_TYPE("AGGREF");
WRITE_OID_FIELD(aggfnoid);
WRITE_OID_FIELD(aggtype);
WRITE_OID_FIELD(aggcollid);
WRITE_OID_FIELD(inputcollid);
Fix handling of argument and result datatypes for partial aggregation. When doing partial aggregation, the args list of the upper (combining) Aggref node is replaced by a Var representing the output of the partial aggregation steps, which has either the aggregate's transition data type or a serialized representation of that. However, nodeAgg.c blindly continued to use the args list as an indication of the user-level argument types. This broke resolution of polymorphic transition datatypes at executor startup (though it accidentally failed to fail for the ANYARRAY case, which is likely the only one anyone had tested). Moreover, the constructed FuncExpr passed to the finalfunc contained completely wrong information, which would have led to bogus answers or crashes for any case where the finalfunc examined that information (which is only likely to be with polymorphic aggregates using a non-polymorphic transition type). As an independent bug, apply_partialaggref_adjustment neglected to resolve a polymorphic transition datatype before assigning it as the output type of the lower-level Aggref node. This again accidentally failed to fail for ANYARRAY but would be unlikely to work in other cases. To fix the first problem, record the user-level argument types in a separate OID-list field of Aggref, and look to that rather than the args list when asking what the argument types were. (It turns out to be convenient to include any "direct" arguments in this list too, although those are not currently subject to being overwritten.) Rather than adding yet another resolve_aggregate_transtype() call to fix the second problem, add an aggtranstype field to Aggref, and store the resolved transition type OID there when the planner first computes it. (By doing this in the planner and not the parser, we can allow the aggregate's transition type to change from time to time, although no DDL support yet exists for that.) This saves nothing of consequence for simple non-polymorphic aggregates, but for polymorphic transition types we save a catalog lookup during executor startup as well as several planner lookups that are new in 9.6 due to parallel query planning. In passing, fix an error that was introduced into count_agg_clauses_walker some time ago: it was applying exprTypmod() to something that wasn't an expression node at all, but a TargetEntry. exprTypmod silently returned -1 so that there was not an obvious failure, but this broke the intended sensitivity of aggregate space consumption estimates to the typmod of varchar and similar data types. This part needs to be back-patched. Catversion bump due to change of stored Aggref nodes. Discussion: <8229.1466109074@sss.pgh.pa.us>
2016-06-18 03:44:37 +02:00
WRITE_OID_FIELD(aggtranstype);
WRITE_NODE_FIELD(aggargtypes);
Support ordered-set (WITHIN GROUP) aggregates. This patch introduces generic support for ordered-set and hypothetical-set aggregate functions, as well as implementations of the instances defined in SQL:2008 (percentile_cont(), percentile_disc(), rank(), dense_rank(), percent_rank(), cume_dist()). We also added mode() though it is not in the spec, as well as versions of percentile_cont() and percentile_disc() that can compute multiple percentile values in one pass over the data. Unlike the original submission, this patch puts full control of the sorting process in the hands of the aggregate's support functions. To allow the support functions to find out how they're supposed to sort, a new API function AggGetAggref() is added to nodeAgg.c. This allows retrieval of the aggregate call's Aggref node, which may have other uses beyond the immediate need. There is also support for ordered-set aggregates to install cleanup callback functions, so that they can be sure that infrastructure such as tuplesort objects gets cleaned up. In passing, make some fixes in the recently-added support for variadic aggregates, and make some editorial adjustments in the recent FILTER additions for aggregates. Also, simplify use of IsBinaryCoercible() by allowing it to succeed whenever the target type is ANY or ANYELEMENT. It was inconsistent that it dealt with other polymorphic target types but not these. Atri Sharma and Andrew Gierth; reviewed by Pavel Stehule and Vik Fearing, and rather heavily editorialized upon by Tom Lane
2013-12-23 22:11:35 +01:00
WRITE_NODE_FIELD(aggdirectargs);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(aggorder);
WRITE_NODE_FIELD(aggdistinct);
WRITE_NODE_FIELD(aggfilter);
WRITE_BOOL_FIELD(aggstar);
WRITE_BOOL_FIELD(aggvariadic);
Support ordered-set (WITHIN GROUP) aggregates. This patch introduces generic support for ordered-set and hypothetical-set aggregate functions, as well as implementations of the instances defined in SQL:2008 (percentile_cont(), percentile_disc(), rank(), dense_rank(), percent_rank(), cume_dist()). We also added mode() though it is not in the spec, as well as versions of percentile_cont() and percentile_disc() that can compute multiple percentile values in one pass over the data. Unlike the original submission, this patch puts full control of the sorting process in the hands of the aggregate's support functions. To allow the support functions to find out how they're supposed to sort, a new API function AggGetAggref() is added to nodeAgg.c. This allows retrieval of the aggregate call's Aggref node, which may have other uses beyond the immediate need. There is also support for ordered-set aggregates to install cleanup callback functions, so that they can be sure that infrastructure such as tuplesort objects gets cleaned up. In passing, make some fixes in the recently-added support for variadic aggregates, and make some editorial adjustments in the recent FILTER additions for aggregates. Also, simplify use of IsBinaryCoercible() by allowing it to succeed whenever the target type is ANY or ANYELEMENT. It was inconsistent that it dealt with other polymorphic target types but not these. Atri Sharma and Andrew Gierth; reviewed by Pavel Stehule and Vik Fearing, and rather heavily editorialized upon by Tom Lane
2013-12-23 22:11:35 +01:00
WRITE_CHAR_FIELD(aggkind);
WRITE_UINT_FIELD(agglevelsup);
WRITE_ENUM_FIELD(aggsplit, AggSplit);
WRITE_INT_FIELD(aggno);
WRITE_INT_FIELD(aggtransno);
WRITE_LOCATION_FIELD(location);
}
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
static void
_outGroupingFunc(StringInfo str, const GroupingFunc *node)
{
WRITE_NODE_TYPE("GROUPINGFUNC");
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(refs);
WRITE_NODE_FIELD(cols);
WRITE_UINT_FIELD(agglevelsup);
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
WRITE_LOCATION_FIELD(location);
}
static void
_outWindowFunc(StringInfo str, const WindowFunc *node)
{
WRITE_NODE_TYPE("WINDOWFUNC");
WRITE_OID_FIELD(winfnoid);
WRITE_OID_FIELD(wintype);
WRITE_OID_FIELD(wincollid);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(aggfilter);
WRITE_UINT_FIELD(winref);
WRITE_BOOL_FIELD(winstar);
WRITE_BOOL_FIELD(winagg);
WRITE_LOCATION_FIELD(location);
}
static void
_outSubscriptingRef(StringInfo str, const SubscriptingRef *node)
{
WRITE_NODE_TYPE("SUBSCRIPTINGREF");
WRITE_OID_FIELD(refcontainertype);
WRITE_OID_FIELD(refelemtype);
Support subscripting of arbitrary types, not only arrays. This patch generalizes the subscripting infrastructure so that any data type can be subscripted, if it provides a handler function to define what that means. Traditional variable-length (varlena) arrays all use array_subscript_handler(), while the existing fixed-length types that support subscripting use raw_array_subscript_handler(). It's expected that other types that want to use subscripting notation will define their own handlers. (This patch provides no such new features, though; it only lays the foundation for them.) To do this, move the parser's semantic processing of subscripts (including coercion to whatever data type is required) into a method callback supplied by the handler. On the execution side, replace the ExecEvalSubscriptingRef* layer of functions with direct calls to callback-supplied execution routines. (Thus, essentially no new run-time overhead should be caused by this patch. Indeed, there is room to remove some overhead by supplying specialized execution routines. This patch does a little bit in that line, but more could be done.) Additional work is required here and there to remove formerly hard-wired assumptions about the result type, collation, etc of a SubscriptingRef expression node; and to remove assumptions that the subscript values must be integers. One useful side-effect of this is that we now have a less squishy mechanism for identifying whether a data type is a "true" array: instead of wiring in weird rules about typlen, we can look to see if pg_type.typsubscript == F_ARRAY_SUBSCRIPT_HANDLER. For this to be bulletproof, we have to forbid user-defined types from using that handler directly; but there seems no good reason for them to do so. This patch also removes assumptions that the number of subscripts is limited to MAXDIM (6), or indeed has any hard-wired limit. That limit still applies to types handled by array_subscript_handler or raw_array_subscript_handler, but to discourage other dependencies on this constant, I've moved it from c.h to utils/array.h. Dmitry Dolgov, reviewed at various times by Tom Lane, Arthur Zakirov, Peter Eisentraut, Pavel Stehule Discussion: https://postgr.es/m/CA+q6zcVDuGBv=M0FqBYX8DPebS3F_0KQ6OVFobGJPM507_SZ_w@mail.gmail.com Discussion: https://postgr.es/m/CA+q6zcVovR+XY4mfk-7oNk-rF91gH0PebnNfuUjuuDsyHjOcVA@mail.gmail.com
2020-12-09 18:40:37 +01:00
WRITE_OID_FIELD(refrestype);
WRITE_INT_FIELD(reftypmod);
WRITE_OID_FIELD(refcollid);
WRITE_NODE_FIELD(refupperindexpr);
WRITE_NODE_FIELD(reflowerindexpr);
WRITE_NODE_FIELD(refexpr);
WRITE_NODE_FIELD(refassgnexpr);
}
static void
_outFuncExpr(StringInfo str, const FuncExpr *node)
{
WRITE_NODE_TYPE("FUNCEXPR");
WRITE_OID_FIELD(funcid);
WRITE_OID_FIELD(funcresulttype);
WRITE_BOOL_FIELD(funcretset);
WRITE_BOOL_FIELD(funcvariadic);
WRITE_ENUM_FIELD(funcformat, CoercionForm);
WRITE_OID_FIELD(funccollid);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outNamedArgExpr(StringInfo str, const NamedArgExpr *node)
{
WRITE_NODE_TYPE("NAMEDARGEXPR");
WRITE_NODE_FIELD(arg);
WRITE_STRING_FIELD(name);
WRITE_INT_FIELD(argnumber);
WRITE_LOCATION_FIELD(location);
}
static void
_outOpExpr(StringInfo str, const OpExpr *node)
{
WRITE_NODE_TYPE("OPEXPR");
WRITE_OID_FIELD(opno);
WRITE_OID_FIELD(opfuncid);
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
WRITE_OID_FIELD(opcollid);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outDistinctExpr(StringInfo str, const DistinctExpr *node)
{
WRITE_NODE_TYPE("DISTINCTEXPR");
WRITE_OID_FIELD(opno);
WRITE_OID_FIELD(opfuncid);
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
WRITE_OID_FIELD(opcollid);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outNullIfExpr(StringInfo str, const NullIfExpr *node)
{
WRITE_NODE_TYPE("NULLIFEXPR");
WRITE_OID_FIELD(opno);
WRITE_OID_FIELD(opfuncid);
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
WRITE_OID_FIELD(opcollid);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outScalarArrayOpExpr(StringInfo str, const ScalarArrayOpExpr *node)
{
WRITE_NODE_TYPE("SCALARARRAYOPEXPR");
WRITE_OID_FIELD(opno);
WRITE_OID_FIELD(opfuncid);
Speedup ScalarArrayOpExpr evaluation ScalarArrayOpExprs with "useOr=true" and a set of Consts on the righthand side have traditionally been evaluated by using a linear search over the array. When these arrays contain large numbers of elements then this linear search could become a significant part of execution time. Here we add a new method of evaluating ScalarArrayOpExpr expressions to allow them to be evaluated by first building a hash table containing each element, then on subsequent evaluations, we just probe that hash table to determine if there is a match. The planner is in charge of determining when this optimization is possible and it enables it by setting hashfuncid in the ScalarArrayOpExpr. The executor will only perform the hash table evaluation when the hashfuncid is set. This means that not all cases are optimized. For example CHECK constraints containing an IN clause won't go through the planner, so won't get the hashfuncid set. We could maybe do something about that at some later date. The reason we're not doing it now is from fear that we may slow down cases where the expression is evaluated only once. Those cases can be common, for example, a single row INSERT to a table with a CHECK constraint containing an IN clause. In the planner, we enable this when there are suitable hash functions for the ScalarArrayOpExpr's operator and only when there is at least MIN_ARRAY_SIZE_FOR_HASHED_SAOP elements in the array. The threshold is currently set to 9. Author: James Coleman, David Rowley Reviewed-by: David Rowley, Tomas Vondra, Heikki Linnakangas Discussion: https://postgr.es/m/CAAaqYe8x62+=wn0zvNKCj55tPpg-JBHzhZFFc6ANovdqFw7-dA@mail.gmail.com
2021-04-08 13:51:22 +02:00
WRITE_OID_FIELD(hashfuncid);
WRITE_OID_FIELD(negfuncid);
WRITE_BOOL_FIELD(useOr);
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
static void
_outBoolExpr(StringInfo str, const BoolExpr *node)
{
char *opstr = NULL;
WRITE_NODE_TYPE("BOOLEXPR");
/* do-it-yourself enum representation */
switch (node->boolop)
{
case AND_EXPR:
opstr = "and";
break;
case OR_EXPR:
opstr = "or";
break;
case NOT_EXPR:
opstr = "not";
break;
}
appendStringInfoString(str, " :boolop ");
outToken(str, opstr);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outSubLink(StringInfo str, const SubLink *node)
{
WRITE_NODE_TYPE("SUBLINK");
WRITE_ENUM_FIELD(subLinkType, SubLinkType);
WRITE_INT_FIELD(subLinkId);
WRITE_NODE_FIELD(testexpr);
WRITE_NODE_FIELD(operName);
WRITE_NODE_FIELD(subselect);
WRITE_LOCATION_FIELD(location);
}
static void
_outSubPlan(StringInfo str, const SubPlan *node)
{
WRITE_NODE_TYPE("SUBPLAN");
WRITE_ENUM_FIELD(subLinkType, SubLinkType);
WRITE_NODE_FIELD(testexpr);
WRITE_NODE_FIELD(paramIds);
WRITE_INT_FIELD(plan_id);
WRITE_STRING_FIELD(plan_name);
WRITE_OID_FIELD(firstColType);
WRITE_INT_FIELD(firstColTypmod);
WRITE_OID_FIELD(firstColCollation);
WRITE_BOOL_FIELD(useHashTable);
WRITE_BOOL_FIELD(unknownEqFalse);
WRITE_BOOL_FIELD(parallel_safe);
WRITE_NODE_FIELD(setParam);
WRITE_NODE_FIELD(parParam);
WRITE_NODE_FIELD(args);
WRITE_FLOAT_FIELD(startup_cost, "%.2f");
WRITE_FLOAT_FIELD(per_call_cost, "%.2f");
}
static void
_outAlternativeSubPlan(StringInfo str, const AlternativeSubPlan *node)
{
WRITE_NODE_TYPE("ALTERNATIVESUBPLAN");
WRITE_NODE_FIELD(subplans);
}
static void
_outFieldSelect(StringInfo str, const FieldSelect *node)
{
WRITE_NODE_TYPE("FIELDSELECT");
WRITE_NODE_FIELD(arg);
WRITE_INT_FIELD(fieldnum);
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
WRITE_OID_FIELD(resultcollid);
}
static void
_outFieldStore(StringInfo str, const FieldStore *node)
{
WRITE_NODE_TYPE("FIELDSTORE");
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(newvals);
WRITE_NODE_FIELD(fieldnums);
WRITE_OID_FIELD(resulttype);
}
static void
_outRelabelType(StringInfo str, const RelabelType *node)
{
WRITE_NODE_TYPE("RELABELTYPE");
WRITE_NODE_FIELD(arg);
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(relabelformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outCoerceViaIO(StringInfo str, const CoerceViaIO *node)
{
WRITE_NODE_TYPE("COERCEVIAIO");
WRITE_NODE_FIELD(arg);
WRITE_OID_FIELD(resulttype);
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(coerceformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outArrayCoerceExpr(StringInfo str, const ArrayCoerceExpr *node)
{
WRITE_NODE_TYPE("ARRAYCOERCEEXPR");
WRITE_NODE_FIELD(arg);
Support arrays over domains. Allowing arrays with a domain type as their element type was left un-done in the original domain patch, but not for any very good reason. This omission leads to such surprising results as array_agg() not working on a domain column, because the parser can't identify a suitable output type for the polymorphic aggregate. In order to fix this, first clean up the APIs of coerce_to_domain() and some internal functions in parse_coerce.c so that we consistently pass around a CoercionContext along with CoercionForm. Previously, we sometimes passed an "isExplicit" boolean flag instead, which is strictly less information; and coerce_to_domain() didn't even get that, but instead had to reverse-engineer isExplicit from CoercionForm. That's contrary to the documentation in primnodes.h that says that CoercionForm only affects display and not semantics. I don't think this change fixes any live bugs, but it makes things more consistent. The main reason for doing it though is that now build_coercion_expression() receives ccontext, which it needs in order to be able to recursively invoke coerce_to_target_type(). Next, reimplement ArrayCoerceExpr so that the node does not directly know any details of what has to be done to the individual array elements while performing the array coercion. Instead, the per-element processing is represented by a sub-expression whose input is a source array element and whose output is a target array element. This simplifies life in parse_coerce.c, because it can build that sub-expression by a recursive invocation of coerce_to_target_type(). The executor now handles the per-element processing as a compiled expression instead of hard-wired code. The main advantage of this is that we can use a single ArrayCoerceExpr to handle as many as three successive steps per element: base type conversion, typmod coercion, and domain constraint checking. The old code used two stacked ArrayCoerceExprs to handle type + typmod coercion, which was pretty inefficient, and adding yet another array deconstruction to do domain constraint checking seemed very unappetizing. In the case where we just need a single, very simple coercion function, doing this straightforwardly leads to a noticeable increase in the per-array-element runtime cost. Hence, add an additional shortcut evalfunc in execExprInterp.c that skips unnecessary overhead for that specific form of expression. The runtime speed of simple cases is within 1% or so of where it was before, while cases that previously required two levels of array processing are significantly faster. Finally, create an implicit array type for every domain type, as we do for base types, enums, etc. Everything except the array-coercion case seems to just work without further effort. Tom Lane, reviewed by Andrew Dunstan Discussion: https://postgr.es/m/9852.1499791473@sss.pgh.pa.us
2017-09-30 19:40:56 +02:00
WRITE_NODE_FIELD(elemexpr);
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(coerceformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outConvertRowtypeExpr(StringInfo str, const ConvertRowtypeExpr *node)
{
WRITE_NODE_TYPE("CONVERTROWTYPEEXPR");
WRITE_NODE_FIELD(arg);
WRITE_OID_FIELD(resulttype);
WRITE_ENUM_FIELD(convertformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outCollateExpr(StringInfo str, const CollateExpr *node)
{
WRITE_NODE_TYPE("COLLATEEXPR");
WRITE_NODE_FIELD(arg);
WRITE_OID_FIELD(collOid);
WRITE_LOCATION_FIELD(location);
}
static void
_outCaseExpr(StringInfo str, const CaseExpr *node)
{
WRITE_NODE_TYPE("CASEEXPR");
WRITE_OID_FIELD(casetype);
WRITE_OID_FIELD(casecollid);
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(defresult);
WRITE_LOCATION_FIELD(location);
}
static void
_outCaseWhen(StringInfo str, const CaseWhen *node)
{
WRITE_NODE_TYPE("CASEWHEN");
WRITE_NODE_FIELD(expr);
WRITE_NODE_FIELD(result);
WRITE_LOCATION_FIELD(location);
}
static void
_outCaseTestExpr(StringInfo str, const CaseTestExpr *node)
{
WRITE_NODE_TYPE("CASETESTEXPR");
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
WRITE_OID_FIELD(collation);
}
static void
_outArrayExpr(StringInfo str, const ArrayExpr *node)
{
WRITE_NODE_TYPE("ARRAYEXPR");
WRITE_OID_FIELD(array_typeid);
WRITE_OID_FIELD(array_collid);
WRITE_OID_FIELD(element_typeid);
WRITE_NODE_FIELD(elements);
WRITE_BOOL_FIELD(multidims);
WRITE_LOCATION_FIELD(location);
}
static void
_outRowExpr(StringInfo str, const RowExpr *node)
{
WRITE_NODE_TYPE("ROWEXPR");
WRITE_NODE_FIELD(args);
WRITE_OID_FIELD(row_typeid);
WRITE_ENUM_FIELD(row_format, CoercionForm);
WRITE_NODE_FIELD(colnames);
WRITE_LOCATION_FIELD(location);
}
static void
_outRowCompareExpr(StringInfo str, const RowCompareExpr *node)
{
WRITE_NODE_TYPE("ROWCOMPAREEXPR");
WRITE_ENUM_FIELD(rctype, RowCompareType);
WRITE_NODE_FIELD(opnos);
WRITE_NODE_FIELD(opfamilies);
WRITE_NODE_FIELD(inputcollids);
WRITE_NODE_FIELD(largs);
WRITE_NODE_FIELD(rargs);
}
static void
_outCoalesceExpr(StringInfo str, const CoalesceExpr *node)
{
WRITE_NODE_TYPE("COALESCEEXPR");
WRITE_OID_FIELD(coalescetype);
WRITE_OID_FIELD(coalescecollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outMinMaxExpr(StringInfo str, const MinMaxExpr *node)
{
WRITE_NODE_TYPE("MINMAXEXPR");
WRITE_OID_FIELD(minmaxtype);
WRITE_OID_FIELD(minmaxcollid);
WRITE_OID_FIELD(inputcollid);
WRITE_ENUM_FIELD(op, MinMaxOp);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
}
static void
_outSQLValueFunction(StringInfo str, const SQLValueFunction *node)
{
WRITE_NODE_TYPE("SQLVALUEFUNCTION");
WRITE_ENUM_FIELD(op, SQLValueFunctionOp);
WRITE_OID_FIELD(type);
WRITE_INT_FIELD(typmod);
WRITE_LOCATION_FIELD(location);
}
static void
_outXmlExpr(StringInfo str, const XmlExpr *node)
{
WRITE_NODE_TYPE("XMLEXPR");
2007-11-15 22:14:46 +01:00
WRITE_ENUM_FIELD(op, XmlExprOp);
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(named_args);
WRITE_NODE_FIELD(arg_names);
WRITE_NODE_FIELD(args);
WRITE_ENUM_FIELD(xmloption, XmlOptionType);
WRITE_OID_FIELD(type);
WRITE_INT_FIELD(typmod);
WRITE_LOCATION_FIELD(location);
}
static void
_outNullTest(StringInfo str, const NullTest *node)
{
WRITE_NODE_TYPE("NULLTEST");
WRITE_NODE_FIELD(arg);
WRITE_ENUM_FIELD(nulltesttype, NullTestType);
WRITE_BOOL_FIELD(argisrow);
WRITE_LOCATION_FIELD(location);
}
static void
_outBooleanTest(StringInfo str, const BooleanTest *node)
{
WRITE_NODE_TYPE("BOOLEANTEST");
WRITE_NODE_FIELD(arg);
WRITE_ENUM_FIELD(booltesttype, BoolTestType);
WRITE_LOCATION_FIELD(location);
}
static void
_outCoerceToDomain(StringInfo str, const CoerceToDomain *node)
{
WRITE_NODE_TYPE("COERCETODOMAIN");
WRITE_NODE_FIELD(arg);
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(coercionformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outCoerceToDomainValue(StringInfo str, const CoerceToDomainValue *node)
{
WRITE_NODE_TYPE("COERCETODOMAINVALUE");
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
WRITE_OID_FIELD(collation);
WRITE_LOCATION_FIELD(location);
}
static void
_outSetToDefault(StringInfo str, const SetToDefault *node)
{
WRITE_NODE_TYPE("SETTODEFAULT");
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
WRITE_OID_FIELD(collation);
WRITE_LOCATION_FIELD(location);
}
static void
_outCurrentOfExpr(StringInfo str, const CurrentOfExpr *node)
{
WRITE_NODE_TYPE("CURRENTOFEXPR");
WRITE_UINT_FIELD(cvarno);
WRITE_STRING_FIELD(cursor_name);
WRITE_INT_FIELD(cursor_param);
}
Code review for NextValueExpr expression node type. Add missing infrastructure for this node type, notably in ruleutils.c where its lack could demonstrably cause EXPLAIN to fail. Add outfuncs/readfuncs support. (outfuncs support is useful today for debugging purposes. The readfuncs support may never be needed, since at present it would only matter for parallel query and NextValueExpr should never appear in a parallelizable query; but it seems like a bad idea to have a primnode type that isn't fully supported here.) Teach planner infrastructure that NextValueExpr is a volatile, parallel-unsafe, non-leaky expression node with cost cpu_operator_cost. Given its limited scope of usage, there *might* be no live bug today from the lack of that knowledge, but it's certainly going to bite us on the rear someday. Teach pg_stat_statements about the new node type, too. While at it, also teach cost_qual_eval() that MinMaxExpr, SQLValueFunction, XmlExpr, and CoerceToDomain should be charged as cpu_operator_cost. Failing to do this for SQLValueFunction was an oversight in my commit 0bb51aa96. The others are longer-standing oversights, but no time like the present to fix them. (In principle, CoerceToDomain could have cost much higher than this, but it doesn't presently seem worth trying to examine the domain's constraints here.) Modify execExprInterp.c to execute NextValueExpr as an out-of-line function; it seems quite unlikely to me that it's worth insisting that it be inlined in all expression eval methods. Besides, providing the out-of-line function doesn't stop anyone from inlining if they want to. Adjust some places where NextValueExpr support had been inserted with the aid of a dartboard rather than keeping it in the same order as elsewhere. Discussion: https://postgr.es/m/23862.1499981661@sss.pgh.pa.us
2017-07-14 21:25:43 +02:00
static void
_outNextValueExpr(StringInfo str, const NextValueExpr *node)
{
WRITE_NODE_TYPE("NEXTVALUEEXPR");
WRITE_OID_FIELD(seqid);
WRITE_OID_FIELD(typeId);
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
static void
_outInferenceElem(StringInfo str, const InferenceElem *node)
{
WRITE_NODE_TYPE("INFERENCEELEM");
WRITE_NODE_FIELD(expr);
WRITE_OID_FIELD(infercollid);
WRITE_OID_FIELD(inferopclass);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
}
static void
_outTargetEntry(StringInfo str, const TargetEntry *node)
{
WRITE_NODE_TYPE("TARGETENTRY");
WRITE_NODE_FIELD(expr);
WRITE_INT_FIELD(resno);
WRITE_STRING_FIELD(resname);
WRITE_UINT_FIELD(ressortgroupref);
WRITE_OID_FIELD(resorigtbl);
WRITE_INT_FIELD(resorigcol);
WRITE_BOOL_FIELD(resjunk);
}
static void
_outRangeTblRef(StringInfo str, const RangeTblRef *node)
{
WRITE_NODE_TYPE("RANGETBLREF");
WRITE_INT_FIELD(rtindex);
}
static void
_outJoinExpr(StringInfo str, const JoinExpr *node)
{
WRITE_NODE_TYPE("JOINEXPR");
WRITE_ENUM_FIELD(jointype, JoinType);
WRITE_BOOL_FIELD(isNatural);
WRITE_NODE_FIELD(larg);
WRITE_NODE_FIELD(rarg);
WRITE_NODE_FIELD(usingClause);
WRITE_NODE_FIELD(join_using_alias);
WRITE_NODE_FIELD(quals);
WRITE_NODE_FIELD(alias);
WRITE_INT_FIELD(rtindex);
}
static void
_outFromExpr(StringInfo str, const FromExpr *node)
{
WRITE_NODE_TYPE("FROMEXPR");
WRITE_NODE_FIELD(fromlist);
WRITE_NODE_FIELD(quals);
1999-01-21 17:08:55 +01:00
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
static void
_outOnConflictExpr(StringInfo str, const OnConflictExpr *node)
{
WRITE_NODE_TYPE("ONCONFLICTEXPR");
WRITE_ENUM_FIELD(action, OnConflictAction);
WRITE_NODE_FIELD(arbiterElems);
WRITE_NODE_FIELD(arbiterWhere);
WRITE_OID_FIELD(constraint);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
WRITE_NODE_FIELD(onConflictSet);
WRITE_NODE_FIELD(onConflictWhere);
WRITE_INT_FIELD(exclRelIndex);
WRITE_NODE_FIELD(exclRelTlist);
}
static void
_outJsonFormat(StringInfo str, const JsonFormat *node)
{
WRITE_NODE_TYPE("JSONFORMAT");
WRITE_ENUM_FIELD(format_type, JsonFormatType);
WRITE_ENUM_FIELD(encoding, JsonEncoding);
WRITE_LOCATION_FIELD(location);
}
static void
_outJsonReturning(StringInfo str, const JsonReturning *node)
{
WRITE_NODE_TYPE("JSONRETURNING");
WRITE_NODE_FIELD(format);
WRITE_OID_FIELD(typid);
WRITE_INT_FIELD(typmod);
}
static void
_outJsonValueExpr(StringInfo str, const JsonValueExpr *node)
{
WRITE_NODE_TYPE("JSONVALUEEXPR");
WRITE_NODE_FIELD(raw_expr);
WRITE_NODE_FIELD(formatted_expr);
WRITE_NODE_FIELD(format);
}
static void
_outJsonConstructorExpr(StringInfo str, const JsonConstructorExpr *node)
{
WRITE_NODE_TYPE("JSONCONSTRUCTOREXPR");
WRITE_ENUM_FIELD(type, JsonConstructorType);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(func);
WRITE_NODE_FIELD(coercion);
WRITE_NODE_FIELD(returning);
WRITE_BOOL_FIELD(absent_on_null);
WRITE_BOOL_FIELD(unique);
WRITE_LOCATION_FIELD(location);
}
static void
_outJsonIsPredicate(StringInfo str, const JsonIsPredicate *node)
{
WRITE_NODE_TYPE("JSONISPREDICATE");
WRITE_NODE_FIELD(expr);
WRITE_NODE_FIELD(format);
WRITE_ENUM_FIELD(item_type, JsonValueType);
WRITE_BOOL_FIELD(unique_keys);
WRITE_LOCATION_FIELD(location);
}
static void
_outJsonBehavior(StringInfo str, const JsonBehavior *node)
{
WRITE_NODE_TYPE("JSONBEHAVIOR");
WRITE_ENUM_FIELD(btype, JsonBehaviorType);
WRITE_NODE_FIELD(default_expr);
}
static void
_outJsonExpr(StringInfo str, const JsonExpr *node)
{
WRITE_NODE_TYPE("JSONEXPR");
WRITE_ENUM_FIELD(op, JsonExprOp);
WRITE_NODE_FIELD(formatted_expr);
WRITE_NODE_FIELD(result_coercion);
WRITE_NODE_FIELD(format);
WRITE_NODE_FIELD(path_spec);
WRITE_NODE_FIELD(passing_names);
WRITE_NODE_FIELD(passing_values);
WRITE_NODE_FIELD(returning);
WRITE_NODE_FIELD(on_empty);
WRITE_NODE_FIELD(on_error);
WRITE_NODE_FIELD(coercions);
WRITE_ENUM_FIELD(wrapper, JsonWrapper);
WRITE_BOOL_FIELD(omit_quotes);
WRITE_LOCATION_FIELD(location);
}
static void
_outJsonCoercion(StringInfo str, const JsonCoercion *node)
{
WRITE_NODE_TYPE("JSONCOERCION");
WRITE_NODE_FIELD(expr);
WRITE_BOOL_FIELD(via_populate);
WRITE_BOOL_FIELD(via_io);
WRITE_OID_FIELD(collation);
}
static void
_outJsonItemCoercions(StringInfo str, const JsonItemCoercions *node)
{
WRITE_NODE_TYPE("JSONITEMCOERCIONS");
WRITE_NODE_FIELD(null);
WRITE_NODE_FIELD(string);
WRITE_NODE_FIELD(numeric);
WRITE_NODE_FIELD(boolean);
WRITE_NODE_FIELD(date);
WRITE_NODE_FIELD(time);
WRITE_NODE_FIELD(timetz);
WRITE_NODE_FIELD(timestamp);
WRITE_NODE_FIELD(timestamptz);
WRITE_NODE_FIELD(composite);
}
static void
_outJsonTableParent(StringInfo str, const JsonTableParent *node)
{
WRITE_NODE_TYPE("JSONTABLEPARENT");
WRITE_NODE_FIELD(path);
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(child);
WRITE_BOOL_FIELD(outerJoin);
WRITE_INT_FIELD(colMin);
WRITE_INT_FIELD(colMax);
WRITE_BOOL_FIELD(errorOnError);
}
static void
_outJsonTableSibling(StringInfo str, const JsonTableSibling *node)
{
WRITE_NODE_TYPE("JSONTABLESIBLING");
WRITE_NODE_FIELD(larg);
WRITE_NODE_FIELD(rarg);
WRITE_BOOL_FIELD(cross);
}
/*****************************************************************************
*
* Stuff from pathnodes.h.
*
*****************************************************************************/
/*
* print the basic stuff of all nodes that inherit from Path
*
* Note we do NOT print the parent, else we'd be in infinite recursion.
* We can print the parent's relids for identification purposes, though.
Add an explicit representation of the output targetlist to Paths. Up to now, there's been an assumption that all Paths for a given relation compute the same output column set (targetlist). However, there are good reasons to remove that assumption. For example, an indexscan on an expression index might be able to return the value of an expensive function "for free". While we have the ability to generate such a plan today in simple cases, we don't have a way to model that it's cheaper than a plan that computes the function from scratch, nor a way to create such a plan in join cases (where the function computation would normally happen at the topmost join node). Also, we need this so that we can have Paths representing post-scan/join steps, where the targetlist may well change from one step to the next. Therefore, invent a "struct PathTarget" representing the columns we expect a plan step to emit. It's convenient to include the output tuple width and tlist evaluation cost in this struct, and there will likely be additional fields in future. While Path nodes that actually do have custom outputs will need their own PathTargets, it will still be true that most Paths for a given relation will compute the same tlist. To reduce the overhead added by this patch, keep a "default PathTarget" in RelOptInfo, and allow Paths that compute that column set to just point to their parent RelOptInfo's reltarget. (In the patch as committed, actually every Path is like that, since we do not yet have any cases of custom PathTargets.) I took this opportunity to provide some more-honest costing of PlaceHolderVar evaluation. Up to now, the assumption that "scan/join reltargetlists have cost zero" was applied not only to Vars, where it's reasonable, but also PlaceHolderVars where it isn't. Now, we add the eval cost of a PlaceHolderVar's expression to the first plan level where it can be computed, by including it in the PathTarget cost field and adding that to the cost estimates for Paths. This isn't perfect yet but it's much better than before, and there is a way forward to improve it more. This costing change affects the join order chosen for a couple of the regression tests, changing expected row ordering.
2016-02-19 02:01:49 +01:00
* We print the pathtarget only if it's not the default one for the rel.
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
* We also do not print the whole of param_info, since it's printed by
* _outRelOptInfo; it's sufficient and less cluttering to print just the
* required outer relids.
*/
static void
_outPathInfo(StringInfo str, const Path *node)
{
WRITE_ENUM_FIELD(pathtype, NodeTag);
appendStringInfoString(str, " :parent_relids ");
outBitmapset(str, node->parent->relids);
if (node->pathtarget != node->parent->reltarget)
WRITE_NODE_FIELD(pathtarget);
appendStringInfoString(str, " :required_outer ");
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
if (node->param_info)
outBitmapset(str, node->param_info->ppi_req_outer);
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
else
outBitmapset(str, NULL);
WRITE_BOOL_FIELD(parallel_aware);
WRITE_BOOL_FIELD(parallel_safe);
WRITE_INT_FIELD(parallel_workers);
WRITE_FLOAT_FIELD(rows, "%.0f");
WRITE_FLOAT_FIELD(startup_cost, "%.2f");
WRITE_FLOAT_FIELD(total_cost, "%.2f");
WRITE_NODE_FIELD(pathkeys);
}
/*
* print the basic stuff of all nodes that inherit from JoinPath
*/
static void
_outJoinPathInfo(StringInfo str, const JoinPath *node)
{
_outPathInfo(str, (const Path *) node);
WRITE_ENUM_FIELD(jointype, JoinType);
WRITE_BOOL_FIELD(inner_unique);
WRITE_NODE_FIELD(outerjoinpath);
WRITE_NODE_FIELD(innerjoinpath);
WRITE_NODE_FIELD(joinrestrictinfo);
}
static void
_outPath(StringInfo str, const Path *node)
{
WRITE_NODE_TYPE("PATH");
_outPathInfo(str, (const Path *) node);
}
static void
_outIndexPath(StringInfo str, const IndexPath *node)
{
WRITE_NODE_TYPE("INDEXPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(indexinfo);
WRITE_NODE_FIELD(indexclauses);
WRITE_NODE_FIELD(indexorderbys);
WRITE_NODE_FIELD(indexorderbycols);
WRITE_ENUM_FIELD(indexscandir, ScanDirection);
WRITE_FLOAT_FIELD(indextotalcost, "%.2f");
WRITE_FLOAT_FIELD(indexselectivity, "%.4f");
}
static void
_outBitmapHeapPath(StringInfo str, const BitmapHeapPath *node)
{
WRITE_NODE_TYPE("BITMAPHEAPPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(bitmapqual);
}
static void
_outBitmapAndPath(StringInfo str, const BitmapAndPath *node)
{
WRITE_NODE_TYPE("BITMAPANDPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(bitmapquals);
WRITE_FLOAT_FIELD(bitmapselectivity, "%.4f");
}
static void
_outBitmapOrPath(StringInfo str, const BitmapOrPath *node)
{
WRITE_NODE_TYPE("BITMAPORPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(bitmapquals);
WRITE_FLOAT_FIELD(bitmapselectivity, "%.4f");
}
static void
_outTidPath(StringInfo str, const TidPath *node)
{
WRITE_NODE_TYPE("TIDPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(tidquals);
}
static void
_outTidRangePath(StringInfo str, const TidRangePath *node)
{
WRITE_NODE_TYPE("TIDRANGEPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(tidrangequals);
}
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
static void
_outSubqueryScanPath(StringInfo str, const SubqueryScanPath *node)
{
WRITE_NODE_TYPE("SUBQUERYSCANPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
}
static void
_outForeignPath(StringInfo str, const ForeignPath *node)
{
WRITE_NODE_TYPE("FOREIGNPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(fdw_outerpath);
WRITE_NODE_FIELD(fdw_private);
}
static void
_outCustomPath(StringInfo str, const CustomPath *node)
{
WRITE_NODE_TYPE("CUSTOMPATH");
_outPathInfo(str, (const Path *) node);
WRITE_UINT_FIELD(flags);
WRITE_NODE_FIELD(custom_paths);
WRITE_NODE_FIELD(custom_private);
appendStringInfoString(str, " :methods ");
outToken(str, node->methods->CustomName);
}
static void
_outAppendPath(StringInfo str, const AppendPath *node)
{
WRITE_NODE_TYPE("APPENDPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpaths);
WRITE_INT_FIELD(first_partial_path);
Use Append rather than MergeAppend for scanning ordered partitions. If we need ordered output from a scan of a partitioned table, but the ordering matches the partition ordering, then we don't need to use a MergeAppend to combine the pre-ordered per-partition scan results: a plain Append will produce the same results. This both saves useless comparison work inside the MergeAppend proper, and allows us to start returning tuples after istarting up just the first child node not all of them. However, all is not peaches and cream, because if some of the child nodes have high startup costs then there will be big discontinuities in the tuples-returned-versus-elapsed-time curve. The planner's cost model cannot handle that (yet, anyway). If we model the Append's startup cost as being just the first child's startup cost, we may drastically underestimate the cost of fetching slightly more tuples than are available from the first child. Since we've had bad experiences with over-optimistic choices of "fast start" plans for ORDER BY LIMIT queries, that seems scary. As a klugy workaround, set the startup cost estimate for an ordered Append to be the sum of its children's startup costs (as MergeAppend would). This doesn't really describe reality, but it's less likely to cause a bad plan choice than an underestimated startup cost would. In practice, the cases where we really care about this optimization will have child plans that are IndexScans with zero startup cost, so that the overly conservative estimate is still just zero. David Rowley, reviewed by Julien Rouhaud and Antonin Houska Discussion: https://postgr.es/m/CAKJS1f-hAqhPLRk_RaSFTgYxd=Tz5hA7kQ2h4-DhJufQk8TGuw@mail.gmail.com
2019-04-06 01:20:30 +02:00
WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
}
static void
_outMergeAppendPath(StringInfo str, const MergeAppendPath *node)
{
WRITE_NODE_TYPE("MERGEAPPENDPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpaths);
WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
}
static void
In the planner, replace an empty FROM clause with a dummy RTE. The fact that "SELECT expression" has no base relations has long been a thorn in the side of the planner. It makes it hard to flatten a sub-query that looks like that, or is a trivial VALUES() item, because the planner generally uses relid sets to identify sub-relations, and such a sub-query would have an empty relid set if we flattened it. prepjointree.c contains some baroque logic that works around this in certain special cases --- but there is a much better answer. We can replace an empty FROM clause with a dummy RTE that acts like a table of one row and no columns, and then there are no such corner cases to worry about. Instead we need some logic to get rid of useless dummy RTEs, but that's simpler and covers more cases than what was there before. For really trivial cases, where the query is just "SELECT expression" and nothing else, there's a hazard that adding the extra RTE makes for a noticeable slowdown; even though it's not much processing, there's not that much for the planner to do overall. However testing says that the penalty is very small, close to the noise level. In more complex queries, this is able to find optimizations that we could not find before. The new RTE type is called RTE_RESULT, since the "scan" plan type it gives rise to is a Result node (the same plan we produced for a "SELECT expression" query before). To avoid confusion, rename the old ResultPath path type to GroupResultPath, reflecting that it's only used in degenerate grouping cases where we know the query produces just one grouped row. (It wouldn't work to unify the two cases, because there are different rules about where the associated quals live during query_planner.) Note: although this touches readfuncs.c, I don't think a catversion bump is required, because the added case can't occur in stored rules, only plans. Patch by me, reviewed by David Rowley and Mark Dilger Discussion: https://postgr.es/m/15944.1521127664@sss.pgh.pa.us
2019-01-28 23:54:10 +01:00
_outGroupResultPath(StringInfo str, const GroupResultPath *node)
{
In the planner, replace an empty FROM clause with a dummy RTE. The fact that "SELECT expression" has no base relations has long been a thorn in the side of the planner. It makes it hard to flatten a sub-query that looks like that, or is a trivial VALUES() item, because the planner generally uses relid sets to identify sub-relations, and such a sub-query would have an empty relid set if we flattened it. prepjointree.c contains some baroque logic that works around this in certain special cases --- but there is a much better answer. We can replace an empty FROM clause with a dummy RTE that acts like a table of one row and no columns, and then there are no such corner cases to worry about. Instead we need some logic to get rid of useless dummy RTEs, but that's simpler and covers more cases than what was there before. For really trivial cases, where the query is just "SELECT expression" and nothing else, there's a hazard that adding the extra RTE makes for a noticeable slowdown; even though it's not much processing, there's not that much for the planner to do overall. However testing says that the penalty is very small, close to the noise level. In more complex queries, this is able to find optimizations that we could not find before. The new RTE type is called RTE_RESULT, since the "scan" plan type it gives rise to is a Result node (the same plan we produced for a "SELECT expression" query before). To avoid confusion, rename the old ResultPath path type to GroupResultPath, reflecting that it's only used in degenerate grouping cases where we know the query produces just one grouped row. (It wouldn't work to unify the two cases, because there are different rules about where the associated quals live during query_planner.) Note: although this touches readfuncs.c, I don't think a catversion bump is required, because the added case can't occur in stored rules, only plans. Patch by me, reviewed by David Rowley and Mark Dilger Discussion: https://postgr.es/m/15944.1521127664@sss.pgh.pa.us
2019-01-28 23:54:10 +01:00
WRITE_NODE_TYPE("GROUPRESULTPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(quals);
}
static void
_outMaterialPath(StringInfo str, const MaterialPath *node)
{
WRITE_NODE_TYPE("MATERIALPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
}
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
static void
_outMemoizePath(StringInfo str, const MemoizePath *node)
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
{
WRITE_NODE_TYPE("MEMOIZEPATH");
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(hash_operators);
WRITE_NODE_FIELD(param_exprs);
WRITE_BOOL_FIELD(singlerow);
WRITE_BOOL_FIELD(binary_mode);
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
WRITE_FLOAT_FIELD(calls, "%.0f");
WRITE_UINT_FIELD(est_entries);
}
static void
_outUniquePath(StringInfo str, const UniquePath *node)
{
WRITE_NODE_TYPE("UNIQUEPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_ENUM_FIELD(umethod, UniquePathMethod);
WRITE_NODE_FIELD(in_operators);
WRITE_NODE_FIELD(uniq_exprs);
}
static void
_outGatherPath(StringInfo str, const GatherPath *node)
{
WRITE_NODE_TYPE("GATHERPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_BOOL_FIELD(single_copy);
WRITE_INT_FIELD(num_workers);
}
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
static void
_outProjectionPath(StringInfo str, const ProjectionPath *node)
{
WRITE_NODE_TYPE("PROJECTIONPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
Refactor planning of projection steps that don't need a Result plan node. The original upper-planner-pathification design (commit 3fc6e2d7f5b652b4) assumed that we could always determine during Path formation whether or not we would need a Result plan node to perform projection of a targetlist. That turns out not to work very well, though, because createplan.c still has some responsibilities for choosing the specific target list associated with sorting/grouping nodes (in particular it might choose to add resjunk columns for sorting). We might not ever refactor that --- doing so would push more work into Path formation, which isn't attractive --- and we certainly won't do so for 9.6. So, while create_projection_path and apply_projection_to_path can tell for sure what will happen if the subpath is projection-capable, they can't tell for sure when it isn't. This is at least a latent bug in apply_projection_to_path, which might think it can apply a target to a non-projecting node when the node will end up computing something different. Also, I'd tied the creation of a ProjectionPath node to whether or not a Result is needed, but it turns out that we sometimes need a ProjectionPath node anyway to avoid modifying a possibly-shared subpath node. Callers had to use create_projection_path for such cases, and we added code to them that knew about the potential omission of a Result node and attempted to adjust the cost estimates for that. That was uncertainly correct and definitely ugly/unmaintainable. To fix, have create_projection_path explicitly check whether a Result is needed and adjust its cost estimate accordingly, though it creates a ProjectionPath in either case. apply_projection_to_path is now mostly just an optimized version that can avoid creating an extra Path node when the input is known to not be shared with any other live path. (There is one case that create_projection_path doesn't handle, which is pushing parallel-safe expressions below a Gather node. We could make it do that by duplicating the GatherPath, but there seems no need as yet.) create_projection_plan still has to recheck the tlist-match condition, which means that if the matching situation does get changed by createplan.c then we'll have made a slightly incorrect cost estimate. But there seems no help for that in the near term, and I doubt it occurs often enough, let alone would change planning decisions often enough, to be worth stressing about. I added a "dummypp" field to ProjectionPath to track whether create_projection_path thinks a Result is needed. This is not really necessary as-committed because create_projection_plan doesn't look at the flag; but it seems like a good idea to remember what we thought when forming the cost estimate, if only for debugging purposes. In passing, get rid of the target_parallel parameter added to apply_projection_to_path by commit 54f5c5150. I don't think that's a good idea because it involves callers in what should be an internal decision, and opens us up to missing optimization opportunities if callers think they don't need to provide a valid flag, as most don't. For the moment, this just costs us an extra has_parallel_hazard call when planning a Gather. If that starts to look expensive, I think a better solution would be to teach PathTarget to carry/cache knowledge of parallel-safety of its contents.
2016-06-22 00:38:20 +02:00
WRITE_BOOL_FIELD(dummypp);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
Move targetlist SRF handling from expression evaluation to new executor node. Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT generate_series(1,5)) so far was done in the expression evaluation (i.e. ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code. This meant that most executor nodes performing projection, and most expression evaluation functions, had to deal with the possibility that an evaluated expression could return a set of return values. That's bad because it leads to repeated code in a lot of places. It also, and that's my (Andres's) motivation, made it a lot harder to implement a more efficient way of doing expression evaluation. To fix this, introduce a new executor node (ProjectSet) that can evaluate targetlists containing one or more SRFs. To avoid the complexity of the old way of handling nested expressions returning sets (e.g. having to pass up ExprDoneCond, and dealing with arguments to functions returning sets etc.), those SRFs can only be at the top level of the node's targetlist. The planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is only necessary in ProjectSet nodes and that SRFs are only present at the top level of the node's targetlist. If there are nested SRFs the planner creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get input from an underlying node. We also discussed and prototyped evaluating targetlist SRFs using ROWS FROM(), but that turned out to be more complicated than we'd hoped. While moving SRF evaluation to ProjectSet would allow to retain the old "least common multiple" behavior when multiple SRFs are present in one targetlist (i.e. continue returning rows until all SRFs are at the end of their input at the same time), we decided to instead only return rows till all SRFs are exhausted, returning NULL for already exhausted ones. We deemed the previous behavior to be too confusing, unexpected and actually not particularly useful. As a side effect, the previously prohibited case of multiple set returning arguments to a function, is now allowed. Not because it's particularly desirable, but because it ends up working and there seems to be no argument for adding code to prohibit it. Currently the behavior for COALESCE and CASE containing SRFs has changed, returning multiple rows from the expression, even when the SRF containing "arm" of the expression is not evaluated. That's because the SRFs are evaluated in a separate ProjectSet node. As that's quite confusing, we're likely to instead prohibit SRFs in those places. But that's still being discussed, and the code would reside in places not touched here, so that's a task for later. There's a lot of, now superfluous, code dealing with set return expressions around. But as the changes to get rid of those are verbose largely boring, it seems better for readability to keep the cleanup as a separate commit. Author: Tom Lane and Andres Freund Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
static void
_outProjectSetPath(StringInfo str, const ProjectSetPath *node)
{
WRITE_NODE_TYPE("PROJECTSETPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
}
static void
_outSortPathInfo(StringInfo str, const SortPath *node)
{
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
}
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
static void
_outSortPath(StringInfo str, const SortPath *node)
{
WRITE_NODE_TYPE("SORTPATH");
_outSortPathInfo(str, node);
}
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
static void
_outIncrementalSortPath(StringInfo str, const IncrementalSortPath *node)
{
WRITE_NODE_TYPE("INCREMENTALSORTPATH");
_outSortPathInfo(str, (const SortPath *) node);
WRITE_INT_FIELD(nPresortedCols);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
static void
_outGroupPath(StringInfo str, const GroupPath *node)
{
WRITE_NODE_TYPE("GROUPPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(groupClause);
WRITE_NODE_FIELD(qual);
}
static void
_outUpperUniquePath(StringInfo str, const UpperUniquePath *node)
{
WRITE_NODE_TYPE("UPPERUNIQUEPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_INT_FIELD(numkeys);
}
static void
_outAggPath(StringInfo str, const AggPath *node)
{
WRITE_NODE_TYPE("AGGPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
WRITE_ENUM_FIELD(aggsplit, AggSplit);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_FLOAT_FIELD(numGroups, "%.0f");
WRITE_UINT64_FIELD(transitionSpace);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_NODE_FIELD(groupClause);
WRITE_NODE_FIELD(qual);
}
static void
_outRollupData(StringInfo str, const RollupData *node)
{
WRITE_NODE_TYPE("ROLLUP");
WRITE_NODE_FIELD(groupClause);
WRITE_NODE_FIELD(gsets);
WRITE_NODE_FIELD(gsets_data);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
WRITE_BOOL_FIELD(hashable);
WRITE_BOOL_FIELD(is_hashed);
}
static void
_outGroupingSetData(StringInfo str, const GroupingSetData *node)
{
WRITE_NODE_TYPE("GSDATA");
WRITE_NODE_FIELD(set);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
}
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
static void
_outGroupingSetsPath(StringInfo str, const GroupingSetsPath *node)
{
WRITE_NODE_TYPE("GROUPINGSETSPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
WRITE_NODE_FIELD(rollups);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_NODE_FIELD(qual);
WRITE_UINT64_FIELD(transitionSpace);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
static void
_outMinMaxAggPath(StringInfo str, const MinMaxAggPath *node)
{
WRITE_NODE_TYPE("MINMAXAGGPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(mmaggregates);
WRITE_NODE_FIELD(quals);
}
static void
_outWindowAggPath(StringInfo str, const WindowAggPath *node)
{
WRITE_NODE_TYPE("WINDOWAGGPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(winclause);
Teach planner and executor about monotonic window funcs Window functions such as row_number() always return a value higher than the previously returned value for tuples in any given window partition. Traditionally queries such as; SELECT * FROM ( SELECT *, row_number() over (order by c) rn FROM t ) t WHERE rn <= 10; were executed fairly inefficiently. Neither the query planner nor the executor knew that once rn made it to 11 that nothing further would match the outer query's WHERE clause. It would blindly continue until all tuples were exhausted from the subquery. Here we implement means to make the above execute more efficiently. This is done by way of adding a pg_proc.prosupport function to various of the built-in window functions and adding supporting code to allow the support function to inform the planner if the window function is monotonically increasing, monotonically decreasing, both or neither. The planner is then able to make use of that information and possibly allow the executor to short-circuit execution by way of adding a "run condition" to the WindowAgg to allow it to determine if some of its execution work can be skipped. This "run condition" is not like a normal filter. These run conditions are only built using quals comparing values to monotonic window functions. For monotonic increasing functions, quals making use of the btree operators for <, <= and = can be used (assuming the window function column is on the left). You can see here that once such a condition becomes false that a monotonic increasing function could never make it subsequently true again. For monotonically decreasing functions the >, >= and = btree operators for the given type can be used for run conditions. The best-case situation for this is when there is a single WindowAgg node without a PARTITION BY clause. Here when the run condition becomes false the WindowAgg node can simply return NULL. No more tuples will ever match the run condition. It's a little more complex when there is a PARTITION BY clause. In this case, we cannot return NULL as we must still process other partitions. To speed this case up we pull tuples from the outer plan to check if they're from the same partition and simply discard them if they are. When we find a tuple belonging to another partition we start processing as normal again until the run condition becomes false or we run out of tuples to process. When there are multiple WindowAgg nodes to evaluate then this complicates the situation. For intermediate WindowAggs we must ensure we always return all tuples to the calling node. Any filtering done could lead to incorrect results in WindowAgg nodes above. For all intermediate nodes, we can still save some work when the run condition becomes false. We've no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot reference the value of these and these tuples will not appear in the final result anyway. The savings here are small in comparison to what can be saved in the top-level WingowAgg, but still worthwhile. Intermediate WindowAgg nodes never filter out tuples, but here we change WindowAgg so that the top-level WindowAgg filters out tuples that don't match the intermediate WindowAgg node's run condition. Such filters appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node. Here we add prosupport functions to allow the above to work for; row_number(), rank(), dense_rank(), count(*) and count(expr). It appears technically possible to do the same for min() and max(), however, it seems unlikely to be useful enough, so that's not done here. Bump catversion Author: David Rowley Reviewed-by: Andy Fan, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
WRITE_NODE_FIELD(qual);
WRITE_BOOL_FIELD(topwindow);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
static void
_outSetOpPath(StringInfo str, const SetOpPath *node)
{
WRITE_NODE_TYPE("SETOPPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_ENUM_FIELD(cmd, SetOpCmd);
WRITE_ENUM_FIELD(strategy, SetOpStrategy);
WRITE_NODE_FIELD(distinctList);
WRITE_INT_FIELD(flagColIdx);
WRITE_INT_FIELD(firstFlag);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
}
static void
_outRecursiveUnionPath(StringInfo str, const RecursiveUnionPath *node)
{
WRITE_NODE_TYPE("RECURSIVEUNIONPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(leftpath);
WRITE_NODE_FIELD(rightpath);
WRITE_NODE_FIELD(distinctList);
WRITE_INT_FIELD(wtParam);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
}
static void
_outLockRowsPath(StringInfo str, const LockRowsPath *node)
{
WRITE_NODE_TYPE("LOCKROWSPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(rowMarks);
WRITE_INT_FIELD(epqParam);
}
static void
_outModifyTablePath(StringInfo str, const ModifyTablePath *node)
{
WRITE_NODE_TYPE("MODIFYTABLEPATH");
_outPathInfo(str, (const Path *) node);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_NODE_FIELD(subpath);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_ENUM_FIELD(operation, CmdType);
WRITE_BOOL_FIELD(canSetTag);
WRITE_UINT_FIELD(nominalRelation);
WRITE_UINT_FIELD(rootRelation);
WRITE_BOOL_FIELD(partColsUpdated);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_NODE_FIELD(resultRelations);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_NODE_FIELD(updateColnosLists);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_NODE_FIELD(withCheckOptionLists);
WRITE_NODE_FIELD(returningLists);
WRITE_NODE_FIELD(rowMarks);
WRITE_NODE_FIELD(onconflict);
WRITE_INT_FIELD(epqParam);
Add support for MERGE SQL command MERGE performs actions that modify rows in the target table using a source table or query. MERGE provides a single SQL statement that can conditionally INSERT/UPDATE/DELETE rows -- a task that would otherwise require multiple PL statements. For example, MERGE INTO target AS t USING source AS s ON t.tid = s.sid WHEN MATCHED AND t.balance > s.delta THEN UPDATE SET balance = t.balance - s.delta WHEN MATCHED THEN DELETE WHEN NOT MATCHED AND s.delta > 0 THEN INSERT VALUES (s.sid, s.delta) WHEN NOT MATCHED THEN DO NOTHING; MERGE works with regular tables, partitioned tables and inheritance hierarchies, including column and row security enforcement, as well as support for row and statement triggers and transition tables therein. MERGE is optimized for OLTP and is parameterizable, though also useful for large scale ETL/ELT. MERGE is not intended to be used in preference to existing single SQL commands for INSERT, UPDATE or DELETE since there is some overhead. MERGE can be used from PL/pgSQL. MERGE does not support targetting updatable views or foreign tables, and RETURNING clauses are not allowed either. These limitations are likely fixable with sufficient effort. Rewrite rules are also not supported, but it's not clear that we'd want to support them. Author: Pavan Deolasee <pavan.deolasee@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Author: Amit Langote <amitlangote09@gmail.com> Author: Simon Riggs <simon.riggs@enterprisedb.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier versions) Reviewed-by: Peter Geoghegan <pg@bowt.ie> (earlier versions) Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Japin Li <japinli@hotmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Zhihong Yu <zyu@yugabyte.com> Discussion: https://postgr.es/m/CANP8+jKitBSrB7oTgT9CY2i1ObfOt36z0XMraQc+Xrz8QB0nXA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkJdBuxj9PO=2QaO9-3h3xGbQPZ34kJH=HukRekwM-GZg@mail.gmail.com Discussion: https://postgr.es/m/20201231134736.GA25392@alvherre.pgsql
2022-03-28 16:45:58 +02:00
WRITE_NODE_FIELD(mergeActionLists);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
static void
_outLimitPath(StringInfo str, const LimitPath *node)
{
WRITE_NODE_TYPE("LIMITPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(limitOffset);
WRITE_NODE_FIELD(limitCount);
WRITE_ENUM_FIELD(limitOption, LimitOption);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
}
static void
_outGatherMergePath(StringInfo str, const GatherMergePath *node)
{
WRITE_NODE_TYPE("GATHERMERGEPATH");
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_INT_FIELD(num_workers);
}
static void
_outNestPath(StringInfo str, const NestPath *node)
{
WRITE_NODE_TYPE("NESTPATH");
_outJoinPathInfo(str, (const JoinPath *) node);
}
static void
_outMergePath(StringInfo str, const MergePath *node)
{
WRITE_NODE_TYPE("MERGEPATH");
_outJoinPathInfo(str, (const JoinPath *) node);
WRITE_NODE_FIELD(path_mergeclauses);
WRITE_NODE_FIELD(outersortkeys);
WRITE_NODE_FIELD(innersortkeys);
WRITE_BOOL_FIELD(skip_mark_restore);
WRITE_BOOL_FIELD(materialize_inner);
}
static void
_outHashPath(StringInfo str, const HashPath *node)
{
WRITE_NODE_TYPE("HASHPATH");
_outJoinPathInfo(str, (const JoinPath *) node);
WRITE_NODE_FIELD(path_hashclauses);
WRITE_INT_FIELD(num_batches);
WRITE_FLOAT_FIELD(inner_rows_total, "%.0f");
}
static void
_outPlannerGlobal(StringInfo str, const PlannerGlobal *node)
{
WRITE_NODE_TYPE("PLANNERGLOBAL");
/* NB: this isn't a complete set of fields */
WRITE_NODE_FIELD(subplans);
WRITE_BITMAPSET_FIELD(rewindPlanIDs);
WRITE_NODE_FIELD(finalrtable);
WRITE_NODE_FIELD(finalrowmarks);
WRITE_NODE_FIELD(resultRelations);
Further adjust EXPLAIN's choices of table alias names. This patch causes EXPLAIN to always assign a separate table alias to the parent RTE of an append relation (inheritance set); before, such RTEs were ignored if not actually scanned by the plan. Since the child RTEs now always have that same alias to start with (cf. commit 55a1954da), the net effect is that the parent RTE usually gets the alias used or implied by the query text, and the children all get that alias with "_N" appended. (The exception to "usually" is if there are duplicate aliases in different subtrees of the original query; then some of those original RTEs will also have "_N" appended.) This results in more uniform output for partitioned-table plans than we had before: the partitioned table itself gets the original alias, and all child tables have aliases with "_N", rather than the previous behavior where one of the children would get an alias without "_N". The reason for giving the parent RTE an alias, even if it isn't scanned by the plan, is that we now use the parent's alias to qualify Vars that refer to an appendrel output column and appear above the Append or MergeAppend that computes the appendrel. But below the append, Vars refer to some one of the child relations, and are displayed that way. This seems clearer than the old behavior where a Var that could carry values from any child relation was displayed as if it referred to only one of them. While at it, change ruleutils.c so that the code paths used by EXPLAIN deal in Plan trees not PlanState trees. This effectively reverts a decision made in commit 1cc29fe7c, which seemed like a good idea at the time to make ruleutils.c consistent with explain.c. However, it's problematic because we'd really like to allow executor startup pruning to remove all the children of an append node when possible, leaving no child PlanState to resolve Vars against. (That's not done here, but will be in the next patch.) This requires different handling of subplans and initplans than before, but is otherwise a pretty straightforward change. Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
WRITE_NODE_FIELD(appendRelations);
WRITE_NODE_FIELD(relationOids);
WRITE_NODE_FIELD(invalItems);
WRITE_NODE_FIELD(paramExecTypes);
WRITE_UINT_FIELD(lastPHId);
WRITE_UINT_FIELD(lastRowMarkId);
Avoid invalidating all foreign-join cached plans when user mappings change. We must not push down a foreign join when the foreign tables involved should be accessed under different user mappings. Previously we tried to enforce that rule literally during planning, but that meant that the resulting plans were dependent on the current contents of the pg_user_mapping catalog, and we had to blow away all cached plans containing any remote join when anything at all changed in pg_user_mapping. This could have been improved somewhat, but the fact that a syscache inval callback has very limited info about what changed made it hard to do better within that design. Instead, let's change the planner to not consider user mappings per se, but to allow a foreign join if both RTEs have the same checkAsUser value. If they do, then they necessarily will use the same user mapping at runtime, and we don't need to know specifically which one that is. Post-plan-time changes in pg_user_mapping no longer require any plan invalidation. This rule does give up some optimization ability, to wit where two foreign table references come from views with different owners or one's from a view and one's directly in the query, but nonetheless the same user mapping would have applied. We'll sacrifice the first case, but to not regress more than we have to in the second case, allow a foreign join involving both zero and nonzero checkAsUser values if the nonzero one is the same as the prevailing effective userID. In that case, mark the plan as only runnable by that userID. The plancache code already had a notion of plans being userID-specific, in order to support RLS. It was a little confused though, in particular lacking clarity of thought as to whether it was the rewritten query or just the finished plan that's dependent on the userID. Rearrange that code so that it's clearer what depends on which, and so that the same logic applies to both RLS-injected role dependency and foreign-join-injected role dependency. Note that this patch doesn't remove the other issue mentioned in the original complaint, which is that while we'll reliably stop using a foreign join if it's disallowed in a new context, we might fail to start using a foreign join if it's now allowed, but we previously created a generic cached plan that didn't use one. It was agreed that the chance of winning that way was not high enough to justify the much larger number of plan invalidations that would have to occur if we tried to cause it to happen. In passing, clean up randomly-varying spelling of EXPLAIN commands in postgres_fdw.sql, and fix a COSTS ON example that had been allowed to leak into the committed tests. This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the previous attempt at ensuring we wouldn't push down foreign joins that span permissions contexts. Etsuro Fujita and Tom Lane Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
WRITE_INT_FIELD(lastPlanNodeId);
WRITE_BOOL_FIELD(transientPlan);
Avoid invalidating all foreign-join cached plans when user mappings change. We must not push down a foreign join when the foreign tables involved should be accessed under different user mappings. Previously we tried to enforce that rule literally during planning, but that meant that the resulting plans were dependent on the current contents of the pg_user_mapping catalog, and we had to blow away all cached plans containing any remote join when anything at all changed in pg_user_mapping. This could have been improved somewhat, but the fact that a syscache inval callback has very limited info about what changed made it hard to do better within that design. Instead, let's change the planner to not consider user mappings per se, but to allow a foreign join if both RTEs have the same checkAsUser value. If they do, then they necessarily will use the same user mapping at runtime, and we don't need to know specifically which one that is. Post-plan-time changes in pg_user_mapping no longer require any plan invalidation. This rule does give up some optimization ability, to wit where two foreign table references come from views with different owners or one's from a view and one's directly in the query, but nonetheless the same user mapping would have applied. We'll sacrifice the first case, but to not regress more than we have to in the second case, allow a foreign join involving both zero and nonzero checkAsUser values if the nonzero one is the same as the prevailing effective userID. In that case, mark the plan as only runnable by that userID. The plancache code already had a notion of plans being userID-specific, in order to support RLS. It was a little confused though, in particular lacking clarity of thought as to whether it was the rewritten query or just the finished plan that's dependent on the userID. Rearrange that code so that it's clearer what depends on which, and so that the same logic applies to both RLS-injected role dependency and foreign-join-injected role dependency. Note that this patch doesn't remove the other issue mentioned in the original complaint, which is that while we'll reliably stop using a foreign join if it's disallowed in a new context, we might fail to start using a foreign join if it's now allowed, but we previously created a generic cached plan that didn't use one. It was agreed that the chance of winning that way was not high enough to justify the much larger number of plan invalidations that would have to occur if we tried to cause it to happen. In passing, clean up randomly-varying spelling of EXPLAIN commands in postgres_fdw.sql, and fix a COSTS ON example that had been allowed to leak into the committed tests. This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the previous attempt at ensuring we wouldn't push down foreign joins that span permissions contexts. Etsuro Fujita and Tom Lane Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
WRITE_BOOL_FIELD(dependsOnRole);
Determine whether it's safe to attempt a parallel plan for a query. Commit 924bcf4f16d54c55310b28f77686608684734f42 introduced a framework for parallel computation in PostgreSQL that makes most but not all built-in functions safe to execute in parallel mode. In order to have parallel query, we'll need to be able to determine whether that query contains functions (either built-in or user-defined) that cannot be safely executed in parallel mode. This requires those functions to be labeled, so this patch introduces an infrastructure for that. Some functions currently labeled as safe may need to be revised depending on how pending issues related to heavyweight locking under paralllelism are resolved. Parallel plans can't be used except for the case where the query will run to completion. If portal execution were suspended, the parallel mode restrictions would need to remain in effect during that time, but that might make other queries fail. Therefore, this patch introduces a framework that enables consideration of parallel plans only when it is known that the plan will be run to completion. This probably needs some refinement; for example, at bind time, we do not know whether a query run via the extended protocol will be execution to completion or run with a limited fetch count. Having the client indicate its intentions at bind time would constitute a wire protocol break. Some contexts in which parallel mode would be safe are not adjusted by this patch; the default is not to try parallel plans except from call sites that have been updated to say that such plans are OK. This commit doesn't introduce any parallel paths or plans; it just provides a way to determine whether they could potentially be used. I'm committing it on the theory that the remaining parallel sequential scan patches will also get committed to this release, hopefully in the not-too-distant future. Robert Haas and Amit Kapila. Reviewed (in earlier versions) by Noah Misch.
2015-09-16 21:38:47 +02:00
WRITE_BOOL_FIELD(parallelModeOK);
WRITE_BOOL_FIELD(parallelModeNeeded);
WRITE_CHAR_FIELD(maxParallelHazard);
}
static void
_outPlannerInfo(StringInfo str, const PlannerInfo *node)
{
WRITE_NODE_TYPE("PLANNERINFO");
/* NB: this isn't a complete set of fields */
WRITE_NODE_FIELD(parse);
WRITE_NODE_FIELD(glob);
WRITE_UINT_FIELD(query_level);
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH. The planner previously assumed that parameter Vars having the same absolute query level, varno, and varattno could safely be assigned the same runtime PARAM_EXEC slot, even though they might be different Vars appearing in different subqueries. This was (probably) safe before the introduction of CTEs, but the lazy-evalution mechanism used for CTEs means that a CTE can be executed during execution of some other subquery, causing the lifespan of Params at the same syntactic nesting level as the CTE to overlap with use of the same slots inside the CTE. In 9.1 we created additional hazards by using the same parameter-assignment technology for nestloop inner scan parameters, but it was broken before that, as illustrated by the added regression test. To fix, restructure the planner's management of PlannerParamItems so that items having different semantic lifespans are kept rigorously separated. This will probably result in complex queries using more runtime PARAM_EXEC slots than before, but the slots are cheap enough that this hardly matters. Also, stop generating PlannerParamItems containing Params for subquery outputs: all we really need to do is reserve the PARAM_EXEC slot number, and that now only takes incrementing a counter. The planning code is simpler and probably faster than before, as well as being more correct. Per report from Vik Reykja. These changes will mostly also need to be made in the back branches, but I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
WRITE_NODE_FIELD(plan_params);
WRITE_BITMAPSET_FIELD(outer_params);
WRITE_BITMAPSET_FIELD(all_baserels);
Compute correct em_nullable_relids in get_eclass_for_sort_expr(). Bug #8591 from Claudio Freire demonstrates that get_eclass_for_sort_expr must be able to compute valid em_nullable_relids for any new equivalence class members it creates. I'd worried about this in the commit message for db9f0e1d9a4a0842c814a464cdc9758c3f20b96c, but claimed that it wasn't a problem because multi-member ECs should already exist when it runs. That is transparently wrong, though, because this function is also called by initialize_mergeclause_eclasses, which runs during deconstruct_jointree. The example given in the bug report (which the new regression test item is based upon) fails because the COALESCE() expression is first seen by initialize_mergeclause_eclasses rather than process_equivalence. Fixing this requires passing the appropriate nullable_relids set to get_eclass_for_sort_expr, and it requires new code to compute that set for top-level expressions such as ORDER BY, GROUP BY, etc. We store the top-level nullable_relids in a new field in PlannerInfo to avoid computing it many times. In the back branches, I've added the new field at the end of the struct to minimize ABI breakage for planner plugins. There doesn't seem to be a good alternative to changing get_eclass_for_sort_expr's API signature, though. There probably aren't any third-party extensions calling that function directly; moreover, if there are, they probably need to think about what to pass for nullable_relids anyway. Back-patch to 9.2, like the previous patch in this area.
2013-11-15 22:46:18 +01:00
WRITE_BITMAPSET_FIELD(nullable_baserels);
WRITE_NODE_FIELD(join_rel_list);
WRITE_INT_FIELD(join_cur_level);
WRITE_NODE_FIELD(init_plans);
WRITE_NODE_FIELD(cte_plan_ids);
WRITE_NODE_FIELD(multiexpr_params);
WRITE_NODE_FIELD(eq_classes);
Speed up finding EquivalenceClasses for a given set of rels Previously in order to determine which ECs a relation had members in, we had to loop over all ECs stored in PlannerInfo's eq_classes and check if ec_relids mentioned the relation. For the most part, this was fine, as generally, unless queries were fairly complex, the overhead of performing the lookup would have not been that significant. However, when queries contained large numbers of joins and ECs, the overhead to find the set of classes matching a given set of relations could become a significant portion of the overall planning effort. Here we allow a much more efficient method to access the ECs which match a given relation or set of relations. A new Bitmapset field in RelOptInfo now exists to store the indexes into PlannerInfo's eq_classes list which each relation is mentioned in. This allows very fast lookups to find all ECs belonging to a single relation. When we need to lookup ECs belonging to a given pair of relations, we can simply bitwise-AND the Bitmapsets from each relation and use the result to perform the lookup. We also take the opportunity to write a new implementation of generate_join_implied_equalities which makes use of the new indexes. generate_join_implied_equalities_for_ecs must remain as is as it can be given a custom list of ECs, which we can't easily determine the indexes of. This was originally intended to fix the performance penalty of looking up foreign keys matching a join condition which was introduced by 100340e2d. However, we're speeding up much more than just that here. Author: David Rowley, Tom Lane Reviewed-by: Tom Lane, Tomas Vondra Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
WRITE_BOOL_FIELD(ec_merging_done);
WRITE_NODE_FIELD(canon_pathkeys);
WRITE_NODE_FIELD(left_join_clauses);
WRITE_NODE_FIELD(right_join_clauses);
WRITE_NODE_FIELD(full_join_clauses);
WRITE_NODE_FIELD(join_info_list);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_BITMAPSET_FIELD(all_result_relids);
WRITE_BITMAPSET_FIELD(leaf_result_relids);
WRITE_NODE_FIELD(append_rel_list);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_NODE_FIELD(row_identity_vars);
WRITE_NODE_FIELD(rowMarks);
WRITE_NODE_FIELD(placeholder_list);
WRITE_NODE_FIELD(fkey_list);
WRITE_NODE_FIELD(query_pathkeys);
WRITE_NODE_FIELD(group_pathkeys);
WRITE_NODE_FIELD(window_pathkeys);
WRITE_NODE_FIELD(distinct_pathkeys);
WRITE_NODE_FIELD(sort_pathkeys);
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
WRITE_NODE_FIELD(processed_tlist);
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
WRITE_NODE_FIELD(update_colnos);
WRITE_NODE_FIELD(minmax_aggs);
WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
Improve RLS planning by marking individual quals with security levels. In an RLS query, we must ensure that security filter quals are evaluated before ordinary query quals, in case the latter contain "leaky" functions that could expose the contents of sensitive rows. The original implementation of RLS planning ensured this by pushing the scan of a secured table into a sub-query that it marked as a security-barrier view. Unfortunately this results in very inefficient plans in many cases, because the sub-query cannot be flattened and gets planned independently of the rest of the query. To fix, drop the use of sub-queries to enforce RLS qual order, and instead mark each qual (RestrictInfo) with a security_level field establishing its priority for evaluation. Quals must be evaluated in security_level order, except that "leakproof" quals can be allowed to go ahead of quals of lower security_level, if it's helpful to do so. This has to be enforced within the ordering of any one list of quals to be evaluated at a table scan node, and we also have to ensure that quals are not chosen for early evaluation (i.e., use as an index qual or TID scan qual) if they're not allowed to go ahead of other quals at the scan node. This is sufficient to fix the problem for RLS quals, since we only support RLS policies on simple tables and thus RLS quals will always exist at the table scan level only. Eventually these qual ordering rules should be enforced for join quals as well, which would permit improving planning for explicit security-barrier views; but that's a task for another patch. Note that FDWs would need to be aware of these rules --- and not, for example, send an insecure qual for remote execution --- but since we do not yet allow RLS policies on foreign tables, the case doesn't arise. This will need to be addressed before we can allow such policies. Patch by me, reviewed by Stephen Frost and Dean Rasheed. Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
WRITE_UINT_FIELD(qual_security_level);
WRITE_BOOL_FIELD(hasJoinRTEs);
WRITE_BOOL_FIELD(hasLateralRTEs);
WRITE_BOOL_FIELD(hasHavingQual);
WRITE_BOOL_FIELD(hasPseudoConstantQuals);
Move resolution of AlternativeSubPlan choices to the planner. When commit bd3daddaf introduced AlternativeSubPlans, I had some ambitions towards allowing the choice of subplan to change during execution. That has not happened, or even been thought about, in the ensuing twelve years; so it seems like a failed experiment. So let's rip that out and resolve the choice of subplan at the end of planning (in setrefs.c) rather than during executor startup. This has a number of positive benefits: * Removal of a few hundred lines of executor code, since AlternativeSubPlans need no longer be supported there. * Removal of executor-startup overhead (particularly, initialization of subplans that won't be used). * Removal of incidental costs of having a larger plan tree, such as tree-scanning and copying costs in the plancache; not to mention setrefs.c's own costs of processing the discarded subplans. * EXPLAIN no longer has to print a weird (and undocumented) representation of an AlternativeSubPlan choice; it sees only the subplan actually used. This should mean less confusion for users. * Since setrefs.c knows which subexpression of a plan node it's working on at any instant, it's possible to adjust the estimated number of executions of the subplan based on that. For example, we should usually estimate more executions of a qual expression than a targetlist expression. The implementation used here is pretty simplistic, because we don't want to expend a lot of cycles on the issue; but it's better than ignoring the point entirely, as the executor had to. That last point might possibly result in shifting the choice between hashed and non-hashed EXISTS subplans in a few cases, but in general this patch isn't meant to change planner choices. Since we're doing the resolution so late, it's really impossible to change any plan choices outside the AlternativeSubPlan itself. Patch by me; thanks to David Rowley for review. Discussion: https://postgr.es/m/1992952.1592785225@sss.pgh.pa.us
2020-09-27 18:51:28 +02:00
WRITE_BOOL_FIELD(hasAlternativeSubPlans);
WRITE_BOOL_FIELD(hasRecursion);
WRITE_INT_FIELD(wt_param_id);
WRITE_BITMAPSET_FIELD(curOuterRels);
WRITE_NODE_FIELD(curOuterParams);
WRITE_BOOL_FIELD(partColsUpdated);
}
static void
_outRelOptInfo(StringInfo str, const RelOptInfo *node)
{
WRITE_NODE_TYPE("RELOPTINFO");
/* NB: this isn't a complete set of fields */
WRITE_ENUM_FIELD(reloptkind, RelOptKind);
WRITE_BITMAPSET_FIELD(relids);
WRITE_FLOAT_FIELD(rows, "%.0f");
WRITE_BOOL_FIELD(consider_startup);
Fix planner's cost estimation for SEMI/ANTI joins with inner indexscans. When the inner side of a nestloop SEMI or ANTI join is an indexscan that uses all the join clauses as indexquals, it can be presumed that both matched and unmatched outer rows will be processed very quickly: for matched rows, we'll stop after fetching one row from the indexscan, while for unmatched rows we'll have an indexscan that finds no matching index entries, which should also be quick. The planner already knew about this, but it was nonetheless charging for at least one full run of the inner indexscan, as a consequence of concerns about the behavior of materialized inner scans --- but those concerns don't apply in the fast case. If the inner side has low cardinality (many matching rows) this could make an indexscan plan look far more expensive than it actually is. To fix, rearrange the work in initial_cost_nestloop/final_cost_nestloop so that we don't add the inner scan cost until we've inspected the indexquals, and then we can add either the full-run cost or just the first tuple's cost as appropriate. Experimentation with this fix uncovered another problem: add_path and friends were coded to disregard cheap startup cost when considering parameterized paths. That's usually okay (and desirable, because it thins the path herd faster); but in this fast case for SEMI/ANTI joins, it could result in throwing away the desired plain indexscan path in favor of a bitmap scan path before we ever get to the join costing logic. In the many-matching-rows cases of interest here, a bitmap scan will do a lot more work than required, so this is a problem. To fix, add a per-relation flag consider_param_startup that works like the existing consider_startup flag, but applies to parameterized paths, and set it for relations that are the inside of a SEMI or ANTI join. To make this patch reasonably safe to back-patch, care has been taken to avoid changing the planner's behavior except in the very narrow case of SEMI/ANTI joins with inner indexscans. There are places in compare_path_costs_fuzzily and add_path_precheck that are not terribly consistent with the new approach, but changing them will affect planner decisions at the margins in other cases, so we'll leave that for a HEAD-only fix. Back-patch to 9.3; before that, the consider_startup flag didn't exist, meaning that the second aspect of the patch would be too invasive. Per a complaint from Peter Holzer and analysis by Tomas Vondra.
2015-06-03 17:58:47 +02:00
WRITE_BOOL_FIELD(consider_param_startup);
Generate parallel sequential scan plans in simple cases. Add a new flag, consider_parallel, to each RelOptInfo, indicating whether a plan for that relation could conceivably be run inside of a parallel worker. Right now, we're pretty conservative: for example, it might be possible to defer applying a parallel-restricted qual in a worker, and later do it in the leader, but right now we just don't try to parallelize access to that relation. That's probably the right decision in most cases, anyway. Using the new flag, generate parallel sequential scan plans for plain baserels, meaning that we now have parallel sequential scan in PostgreSQL. The logic here is pretty unsophisticated right now: the costing model probably isn't right in detail, and we can't push joins beneath Gather nodes, so the number of plans that can actually benefit from this is pretty limited right now. Lots more work is needed. Nevertheless, it seems time to enable this functionality so that all this code can actually be tested easily by users and developers. Note that, if you wish to test this functionality, it will be necessary to set max_parallel_degree to a value greater than the default of 0. Once a few more loose ends have been tidied up here, we might want to consider changing the default value of this GUC, but I'm leaving it alone for now. Along the way, fix a bug in cost_gather: the previous coding thought that a Gather node's transfer overhead should be costed on the basis of the relation size rather than the number of tuples that actually need to be passed off to the leader. Patch by me, reviewed in earlier versions by Amit Kapila.
2015-11-11 15:02:52 +01:00
WRITE_BOOL_FIELD(consider_parallel);
WRITE_NODE_FIELD(reltarget);
WRITE_NODE_FIELD(pathlist);
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
WRITE_NODE_FIELD(ppilist);
WRITE_NODE_FIELD(partial_pathlist);
WRITE_NODE_FIELD(cheapest_startup_path);
WRITE_NODE_FIELD(cheapest_total_path);
WRITE_NODE_FIELD(cheapest_unique_path);
WRITE_NODE_FIELD(cheapest_parameterized_paths);
WRITE_BITMAPSET_FIELD(direct_lateral_relids);
WRITE_BITMAPSET_FIELD(lateral_relids);
WRITE_UINT_FIELD(relid);
WRITE_OID_FIELD(reltablespace);
WRITE_ENUM_FIELD(rtekind, RTEKind);
WRITE_INT_FIELD(min_attr);
WRITE_INT_FIELD(max_attr);
WRITE_NODE_FIELD(lateral_vars);
WRITE_BITMAPSET_FIELD(lateral_referencers);
WRITE_NODE_FIELD(indexlist);
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
WRITE_NODE_FIELD(statlist);
WRITE_UINT_FIELD(pages);
WRITE_FLOAT_FIELD(tuples, "%.0f");
WRITE_FLOAT_FIELD(allvisfrac, "%.6f");
Speed up finding EquivalenceClasses for a given set of rels Previously in order to determine which ECs a relation had members in, we had to loop over all ECs stored in PlannerInfo's eq_classes and check if ec_relids mentioned the relation. For the most part, this was fine, as generally, unless queries were fairly complex, the overhead of performing the lookup would have not been that significant. However, when queries contained large numbers of joins and ECs, the overhead to find the set of classes matching a given set of relations could become a significant portion of the overall planning effort. Here we allow a much more efficient method to access the ECs which match a given relation or set of relations. A new Bitmapset field in RelOptInfo now exists to store the indexes into PlannerInfo's eq_classes list which each relation is mentioned in. This allows very fast lookups to find all ECs belonging to a single relation. When we need to lookup ECs belonging to a given pair of relations, we can simply bitwise-AND the Bitmapsets from each relation and use the result to perform the lookup. We also take the opportunity to write a new implementation of generate_join_implied_equalities which makes use of the new indexes. generate_join_implied_equalities_for_ecs must remain as is as it can be given a custom list of ECs, which we can't easily determine the indexes of. This was originally intended to fix the performance penalty of looking up foreign keys matching a join condition which was introduced by 100340e2d. However, we're speeding up much more than just that here. Author: David Rowley, Tom Lane Reviewed-by: Tom Lane, Tomas Vondra Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
WRITE_BITMAPSET_FIELD(eclass_indexes);
WRITE_NODE_FIELD(subroot);
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH. The planner previously assumed that parameter Vars having the same absolute query level, varno, and varattno could safely be assigned the same runtime PARAM_EXEC slot, even though they might be different Vars appearing in different subqueries. This was (probably) safe before the introduction of CTEs, but the lazy-evalution mechanism used for CTEs means that a CTE can be executed during execution of some other subquery, causing the lifespan of Params at the same syntactic nesting level as the CTE to overlap with use of the same slots inside the CTE. In 9.1 we created additional hazards by using the same parameter-assignment technology for nestloop inner scan parameters, but it was broken before that, as illustrated by the added regression test. To fix, restructure the planner's management of PlannerParamItems so that items having different semantic lifespans are kept rigorously separated. This will probably result in complex queries using more runtime PARAM_EXEC slots than before, but the slots are cheap enough that this hardly matters. Also, stop generating PlannerParamItems containing Params for subquery outputs: all we really need to do is reserve the PARAM_EXEC slot number, and that now only takes incrementing a counter. The planning code is simpler and probably faster than before, as well as being more correct. Per report from Vik Reykja. These changes will mostly also need to be made in the back branches, but I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
WRITE_NODE_FIELD(subplan_params);
Avoid invalidating all foreign-join cached plans when user mappings change. We must not push down a foreign join when the foreign tables involved should be accessed under different user mappings. Previously we tried to enforce that rule literally during planning, but that meant that the resulting plans were dependent on the current contents of the pg_user_mapping catalog, and we had to blow away all cached plans containing any remote join when anything at all changed in pg_user_mapping. This could have been improved somewhat, but the fact that a syscache inval callback has very limited info about what changed made it hard to do better within that design. Instead, let's change the planner to not consider user mappings per se, but to allow a foreign join if both RTEs have the same checkAsUser value. If they do, then they necessarily will use the same user mapping at runtime, and we don't need to know specifically which one that is. Post-plan-time changes in pg_user_mapping no longer require any plan invalidation. This rule does give up some optimization ability, to wit where two foreign table references come from views with different owners or one's from a view and one's directly in the query, but nonetheless the same user mapping would have applied. We'll sacrifice the first case, but to not regress more than we have to in the second case, allow a foreign join involving both zero and nonzero checkAsUser values if the nonzero one is the same as the prevailing effective userID. In that case, mark the plan as only runnable by that userID. The plancache code already had a notion of plans being userID-specific, in order to support RLS. It was a little confused though, in particular lacking clarity of thought as to whether it was the rewritten query or just the finished plan that's dependent on the userID. Rearrange that code so that it's clearer what depends on which, and so that the same logic applies to both RLS-injected role dependency and foreign-join-injected role dependency. Note that this patch doesn't remove the other issue mentioned in the original complaint, which is that while we'll reliably stop using a foreign join if it's disallowed in a new context, we might fail to start using a foreign join if it's now allowed, but we previously created a generic cached plan that didn't use one. It was agreed that the chance of winning that way was not high enough to justify the much larger number of plan invalidations that would have to occur if we tried to cause it to happen. In passing, clean up randomly-varying spelling of EXPLAIN commands in postgres_fdw.sql, and fix a COSTS ON example that had been allowed to leak into the committed tests. This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the previous attempt at ensuring we wouldn't push down foreign joins that span permissions contexts. Etsuro Fujita and Tom Lane Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
WRITE_INT_FIELD(rel_parallel_workers);
WRITE_UINT_FIELD(amflags);
Code review for foreign/custom join pushdown patch. Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design decisions that seem pretty questionable to me, and there was quite a lot of stuff not to like about the documentation and comments. Clean up as follows: * Consider foreign joins only between foreign tables on the same server, rather than between any two foreign tables with the same underlying FDW handler function. In most if not all cases, the FDW would simply have had to apply the same-server restriction itself (far more expensively, both for lack of caching and because it would be repeated for each combination of input sub-joins), or else risk nasty bugs. Anyone who's really intent on doing something outside this restriction can always use the set_join_pathlist_hook. * Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist to better reflect what they're for, and allow these custom scan tlists to be used even for base relations. * Change make_foreignscan() API to include passing the fdw_scan_tlist value, since the FDW is required to set that. Backwards compatibility doesn't seem like an adequate reason to expect FDWs to set it in some ad-hoc extra step, and anyway existing FDWs can just pass NIL. * Change the API of path-generating subroutines of add_paths_to_joinrel, and in particular that of GetForeignJoinPaths and set_join_pathlist_hook, so that various less-used parameters are passed in a struct rather than as separate parameter-list entries. The objective here is to reduce the probability that future additions to those parameter lists will result in source-level API breaks for users of these hooks. It's possible that this is even a small win for the core code, since most CPU architectures can't pass more than half a dozen parameters efficiently anyway. I kept root, joinrel, outerrel, innerrel, and jointype as separate parameters to reduce code churn in joinpath.c --- in particular, putting jointype into the struct would have been problematic because of the subroutines' habit of changing their local copies of that variable. * Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all right for it to know about IndexOnlyScan, but if the list is to grow we should refactor the knowledge out to the callers. * Restore nodeForeignscan.c's previous use of the relcache to avoid extra GetFdwRoutine lookups for base-relation scans. * Lots of cleanup of documentation and missed comments. Re-order some code additions into more logical places.
2015-05-10 20:36:30 +02:00
WRITE_OID_FIELD(serverid);
Avoid invalidating all foreign-join cached plans when user mappings change. We must not push down a foreign join when the foreign tables involved should be accessed under different user mappings. Previously we tried to enforce that rule literally during planning, but that meant that the resulting plans were dependent on the current contents of the pg_user_mapping catalog, and we had to blow away all cached plans containing any remote join when anything at all changed in pg_user_mapping. This could have been improved somewhat, but the fact that a syscache inval callback has very limited info about what changed made it hard to do better within that design. Instead, let's change the planner to not consider user mappings per se, but to allow a foreign join if both RTEs have the same checkAsUser value. If they do, then they necessarily will use the same user mapping at runtime, and we don't need to know specifically which one that is. Post-plan-time changes in pg_user_mapping no longer require any plan invalidation. This rule does give up some optimization ability, to wit where two foreign table references come from views with different owners or one's from a view and one's directly in the query, but nonetheless the same user mapping would have applied. We'll sacrifice the first case, but to not regress more than we have to in the second case, allow a foreign join involving both zero and nonzero checkAsUser values if the nonzero one is the same as the prevailing effective userID. In that case, mark the plan as only runnable by that userID. The plancache code already had a notion of plans being userID-specific, in order to support RLS. It was a little confused though, in particular lacking clarity of thought as to whether it was the rewritten query or just the finished plan that's dependent on the userID. Rearrange that code so that it's clearer what depends on which, and so that the same logic applies to both RLS-injected role dependency and foreign-join-injected role dependency. Note that this patch doesn't remove the other issue mentioned in the original complaint, which is that while we'll reliably stop using a foreign join if it's disallowed in a new context, we might fail to start using a foreign join if it's now allowed, but we previously created a generic cached plan that didn't use one. It was agreed that the chance of winning that way was not high enough to justify the much larger number of plan invalidations that would have to occur if we tried to cause it to happen. In passing, clean up randomly-varying spelling of EXPLAIN commands in postgres_fdw.sql, and fix a COSTS ON example that had been allowed to leak into the committed tests. This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the previous attempt at ensuring we wouldn't push down foreign joins that span permissions contexts. Etsuro Fujita and Tom Lane Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
WRITE_OID_FIELD(userid);
WRITE_BOOL_FIELD(useridiscurrent);
Revise FDW planning API, again. Further reflection shows that a single callback isn't very workable if we desire to let FDWs generate multiple Paths, because that forces the FDW to do all work necessary to generate a valid Plan node for each Path. Instead split the former PlanForeignScan API into three steps: GetForeignRelSize, GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain, and it's substantially more flexible for complex FDWs. Add an fdw_private field to RelOptInfo so that the new functions can save state there rather than possibly having to recalculate information two or three times. In addition, we'd not thought through what would be needed to allow an FDW to set up subexpressions of its choice for runtime execution. We could treat ForeignScan.fdw_private as an executable expression but that seems likely to break existing FDWs unnecessarily (in particular, it would restrict the set of node types allowable in fdw_private to those supported by expression_tree_walker). Instead, invent a separate field fdw_exprs which will receive the postprocessing appropriate for expression trees. (One field is enough since it can be a list of expressions; also, we assume the corresponding expression state tree(s) will be held within fdw_state, so we don't need to add anything to ForeignScanState.) Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this further as we continue to work on that patch, but to me it feels a lot closer to being right now.
2012-03-09 18:48:48 +01:00
/* we don't try to print fdwroutine or fdw_private */
/* can't print unique_for_rels/non_unique_for_rels; BMSes aren't Nodes */
WRITE_NODE_FIELD(baserestrictinfo);
Improve RLS planning by marking individual quals with security levels. In an RLS query, we must ensure that security filter quals are evaluated before ordinary query quals, in case the latter contain "leaky" functions that could expose the contents of sensitive rows. The original implementation of RLS planning ensured this by pushing the scan of a secured table into a sub-query that it marked as a security-barrier view. Unfortunately this results in very inefficient plans in many cases, because the sub-query cannot be flattened and gets planned independently of the rest of the query. To fix, drop the use of sub-queries to enforce RLS qual order, and instead mark each qual (RestrictInfo) with a security_level field establishing its priority for evaluation. Quals must be evaluated in security_level order, except that "leakproof" quals can be allowed to go ahead of quals of lower security_level, if it's helpful to do so. This has to be enforced within the ordering of any one list of quals to be evaluated at a table scan node, and we also have to ensure that quals are not chosen for early evaluation (i.e., use as an index qual or TID scan qual) if they're not allowed to go ahead of other quals at the scan node. This is sufficient to fix the problem for RLS quals, since we only support RLS policies on simple tables and thus RLS quals will always exist at the table scan level only. Eventually these qual ordering rules should be enforced for join quals as well, which would permit improving planning for explicit security-barrier views; but that's a task for another patch. Note that FDWs would need to be aware of these rules --- and not, for example, send an insecure qual for remote execution --- but since we do not yet allow RLS policies on foreign tables, the case doesn't arise. This will need to be addressed before we can allow such policies. Patch by me, reviewed by Stephen Frost and Dean Rasheed. Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
WRITE_UINT_FIELD(baserestrict_min_security);
WRITE_NODE_FIELD(joininfo);
WRITE_BOOL_FIELD(has_eclass_joins);
Disable support for partitionwise joins in problematic cases. Commit f49842d, which added support for partitionwise joins, built the child's tlist by applying adjust_appendrel_attrs() to the parent's. So in the case where the parent's included a whole-row Var for the parent, the child's contained a ConvertRowtypeExpr. To cope with that, that commit added code to the planner, such as setrefs.c, but some code paths still assumed that the tlist for a scan (or join) rel would only include Vars and PlaceHolderVars, which was true before that commit, causing errors: * When creating an explicit sort node for an input path for a mergejoin path for a child join, prepare_sort_from_pathkeys() threw the 'could not find pathkey item to sort' error. * When deparsing a relation participating in a pushed down child join as a subquery in contrib/postgres_fdw, get_relation_column_alias_ids() threw the 'unexpected expression in subquery output' error. * When performing set_plan_references() on a local join plan generated by contrib/postgres_fdw for EvalPlanQual support for a pushed down child join, fix_join_expr() threw the 'variable not found in subplan target lists' error. To fix these, two approaches have been proposed: one by Ashutosh Bapat and one by me. While the former keeps building the child's tlist with a ConvertRowtypeExpr, the latter builds it with a whole-row Var for the child not to violate the planner assumption, and tries to fix it up later, But both approaches need more work, so refuse to generate partitionwise join paths when whole-row Vars are involved, instead. We don't need to handle ConvertRowtypeExprs in the child's tlists for now, so this commit also removes the changes to the planner. Previously, partitionwise join computed attr_needed data for each child separately, and built the child join's tlist using that data, which also required an extra step for adding PlaceHolderVars to that tlist, but it would be more efficient to build it from the parent join's tlist through the adjust_appendrel_attrs() transformation. So this commit builds that list that way, and simplifies build_joinrel_tlist() and placeholder.c as well as part of set_append_rel_size() to basically what they were before partitionwise join went in. Back-patch to PG11 where partitionwise join was introduced. Report by Rajkumar Raghuwanshi. Analysis by Ashutosh Bapat, who also provided some of regression tests. Patch by me, reviewed by Robert Haas. Discussion: https://postgr.es/m/CAKcux6ktu-8tefLWtQuuZBYFaZA83vUzuRd7c1YHC-yEWyYFpg@mail.gmail.com
2018-08-31 13:34:06 +02:00
WRITE_BOOL_FIELD(consider_partitionwise_join);
WRITE_BITMAPSET_FIELD(top_parent_relids);
WRITE_BOOL_FIELD(partbounds_merged);
WRITE_BITMAPSET_FIELD(live_parts);
WRITE_BITMAPSET_FIELD(all_partrels);
}
static void
_outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
{
WRITE_NODE_TYPE("INDEXOPTINFO");
/* NB: this isn't a complete set of fields */
WRITE_OID_FIELD(indexoid);
/* Do NOT print rel field, else infinite recursion */
WRITE_UINT_FIELD(pages);
WRITE_FLOAT_FIELD(tuples, "%.0f");
Redesign the planner's handling of index-descent cost estimation. Historically we've used a couple of very ad-hoc fudge factors to try to get the right results when indexes of different sizes would satisfy a query with the same number of index leaf tuples being visited. In commit 21a39de5809cd3050a37d2554323cc1d0cbeed9d I tweaked one of these fudge factors, with results that proved disastrous for larger indexes. Commit bf01e34b556ff37982ba2d882db424aa484c0d07 fudged it some more, but still with not a lot of principle behind it. What seems like a better way to address these issues is to explicitly model index-descent costs, since that's what's really at stake when considering diferent indexes with similar leaf-page-level costs. We tried that once long ago, and found that charging random_page_cost per page descended through was way too much, because upper btree levels tend to stay in cache in real-world workloads. However, there's still CPU costs to think about, and the previous fudge factors can be seen as a crude attempt to account for those costs. So this patch replaces those fudge factors with explicit charges for the number of tuple comparisons needed to descend the index tree, plus a small charge per page touched in the descent. The cost multipliers are chosen so that the resulting charges are in the vicinity of the historical (pre-9.2) fudge factors for indexes of up to about a million tuples, while not ballooning unreasonably beyond that, as the old fudge factor did (even more so in 9.2). To make this work accurately for btree indexes, add some code that allows extraction of the known root-page height from a btree. There's no equivalent number readily available for other index types, but we can use the log of the number of index pages as an approximate substitute. This seems like too much of a behavioral change to risk back-patching, but it should improve matters going forward. In 9.2 I'll just revert the fudge-factor change.
2013-01-11 18:56:58 +01:00
WRITE_INT_FIELD(tree_height);
WRITE_INT_FIELD(ncolumns);
Redesign the planner's handling of index-descent cost estimation. Historically we've used a couple of very ad-hoc fudge factors to try to get the right results when indexes of different sizes would satisfy a query with the same number of index leaf tuples being visited. In commit 21a39de5809cd3050a37d2554323cc1d0cbeed9d I tweaked one of these fudge factors, with results that proved disastrous for larger indexes. Commit bf01e34b556ff37982ba2d882db424aa484c0d07 fudged it some more, but still with not a lot of principle behind it. What seems like a better way to address these issues is to explicitly model index-descent costs, since that's what's really at stake when considering diferent indexes with similar leaf-page-level costs. We tried that once long ago, and found that charging random_page_cost per page descended through was way too much, because upper btree levels tend to stay in cache in real-world workloads. However, there's still CPU costs to think about, and the previous fudge factors can be seen as a crude attempt to account for those costs. So this patch replaces those fudge factors with explicit charges for the number of tuple comparisons needed to descend the index tree, plus a small charge per page touched in the descent. The cost multipliers are chosen so that the resulting charges are in the vicinity of the historical (pre-9.2) fudge factors for indexes of up to about a million tuples, while not ballooning unreasonably beyond that, as the old fudge factor did (even more so in 9.2). To make this work accurately for btree indexes, add some code that allows extraction of the known root-page height from a btree. There's no equivalent number readily available for other index types, but we can use the log of the number of index pages as an approximate substitute. This seems like too much of a behavioral change to risk back-patching, but it should improve matters going forward. In 9.2 I'll just revert the fudge-factor change.
2013-01-11 18:56:58 +01:00
/* array fields aren't really worth the trouble to print */
WRITE_OID_FIELD(relam);
/* indexprs is redundant since we print indextlist */
WRITE_NODE_FIELD(indpred);
WRITE_NODE_FIELD(indextlist);
Support using index-only scans with partial indexes in more cases. Previously, the planner would reject an index-only scan if any restriction clause for its table used a column not available from the index, even if that restriction clause would later be dropped from the plan entirely because it's implied by the index's predicate. This is a fairly common situation for partial indexes because predicates using columns not included in the index are often the most useful kind of predicate, and we have to duplicate (or at least imply) the predicate in the WHERE clause in order to get the index to be considered at all. So index-only scans were essentially unavailable with such partial indexes. To fix, we have to do detection of implied-by-predicate clauses much earlier in the planner. This patch puts it in check_index_predicates (nee check_partial_indexes), meaning it gets done for every partial index, whereas we previously only considered this issue at createplan time, so that the work was only done for an index actually selected for use. That could result in a noticeable planning slowdown for queries against tables with many partial indexes. However, testing suggested that there isn't really a significant cost, especially not with reasonable numbers of partial indexes. We do get a small additional benefit, which is that cost_index is more accurate since it correctly discounts the evaluation cost of clauses that will be removed. We can also avoid considering such clauses as potential indexquals, which saves useless matching cycles in the case where the predicate columns aren't in the index, and prevents generating bogus plans that double-count the clause's selectivity when the columns are in the index. Tomas Vondra and Kyotaro Horiguchi, reviewed by Kevin Grittner and Konstantin Knizhnik, and whacked around a little by me
2016-03-31 20:48:56 +02:00
WRITE_NODE_FIELD(indrestrictinfo);
WRITE_BOOL_FIELD(predOK);
WRITE_BOOL_FIELD(unique);
WRITE_BOOL_FIELD(immediate);
WRITE_BOOL_FIELD(hypothetical);
/* we don't bother with fields copied from the index AM's API struct */
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
static void
_outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node)
{
int i;
WRITE_NODE_TYPE("FOREIGNKEYOPTINFO");
WRITE_UINT_FIELD(con_relid);
WRITE_UINT_FIELD(ref_relid);
WRITE_INT_FIELD(nkeys);
WRITE_ATTRNUMBER_ARRAY(conkey, node->nkeys);
WRITE_ATTRNUMBER_ARRAY(confkey, node->nkeys);
WRITE_OID_ARRAY(conpfeqop, node->nkeys);
WRITE_INT_FIELD(nmatched_ec);
Fix foreign-key selectivity estimation in the presence of constants. get_foreign_key_join_selectivity() looks for join clauses that equate the two sides of the FK constraint. However, if we have a query like "WHERE fktab.a = pktab.a and fktab.a = 1", it won't find any such join clause, because equivclass.c replaces the given clauses with "fktab.a = 1 and pktab.a = 1", which can be enforced at the scan level, leaving nothing to be done for column "a" at the join level. We can fix that expectation without much trouble, but then a new problem arises: applying the foreign-key-based selectivity rule produces a rowcount underestimate, because we're effectively double-counting the selectivity of the "fktab.a = 1" clause. So we have to cancel that selectivity out of the estimate. To fix, refactor process_implied_equality() so that it can pass back the new RestrictInfo to its callers in equivclass.c, allowing the generated "fktab.a = 1" clause to be saved in the EquivalenceClass's ec_derives list. Then it's not much trouble to dig out the relevant RestrictInfo when we need to adjust an FK selectivity estimate. (While at it, we can also remove the expensive use of initialize_mergeclause_eclasses() to set up the new RestrictInfo's left_ec and right_ec pointers. The equivclass.c code can set those basically for free.) This seems like clearly a bug fix, but I'm hesitant to back-patch it, first because there's some API/ABI risk for extensions and second because we're usually loath to destabilize plan choices in stable branches. Per report from Sigrid Ehrenreich. Discussion: https://postgr.es/m/1019549.1603770457@sss.pgh.pa.us Discussion: https://postgr.es/m/AM6PR02MB5287A0ADD936C1FA80973E72AB190@AM6PR02MB5287.eurprd02.prod.outlook.com
2020-10-28 16:15:47 +01:00
WRITE_INT_FIELD(nconst_ec);
WRITE_INT_FIELD(nmatched_rcols);
WRITE_INT_FIELD(nmatched_ri);
/* for compactness, just print the number of matches per column: */
appendStringInfoString(str, " :eclass");
for (i = 0; i < node->nkeys; i++)
appendStringInfo(str, " %d", (node->eclass[i] != NULL));
appendStringInfoString(str, " :rinfos");
for (i = 0; i < node->nkeys; i++)
appendStringInfo(str, " %d", list_length(node->rinfos[i]));
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
static void
_outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node)
{
WRITE_NODE_TYPE("STATISTICEXTINFO");
/* NB: this isn't a complete set of fields */
WRITE_OID_FIELD(statOid);
/* don't write rel, leads to infinite recursion in plan tree dump */
WRITE_CHAR_FIELD(kind);
WRITE_BITMAPSET_FIELD(keys);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
static void
_outEquivalenceClass(StringInfo str, const EquivalenceClass *node)
{
/*
* To simplify reading, we just chase up to the topmost merged EC and
* print that, without bothering to show the merge-ees separately.
*/
while (node->ec_merged)
node = node->ec_merged;
WRITE_NODE_TYPE("EQUIVALENCECLASS");
WRITE_NODE_FIELD(ec_opfamilies);
WRITE_OID_FIELD(ec_collation);
WRITE_NODE_FIELD(ec_members);
WRITE_NODE_FIELD(ec_sources);
WRITE_NODE_FIELD(ec_derives);
WRITE_BITMAPSET_FIELD(ec_relids);
WRITE_BOOL_FIELD(ec_has_const);
WRITE_BOOL_FIELD(ec_has_volatile);
WRITE_BOOL_FIELD(ec_below_outer_join);
WRITE_BOOL_FIELD(ec_broken);
WRITE_UINT_FIELD(ec_sortref);
Improve RLS planning by marking individual quals with security levels. In an RLS query, we must ensure that security filter quals are evaluated before ordinary query quals, in case the latter contain "leaky" functions that could expose the contents of sensitive rows. The original implementation of RLS planning ensured this by pushing the scan of a secured table into a sub-query that it marked as a security-barrier view. Unfortunately this results in very inefficient plans in many cases, because the sub-query cannot be flattened and gets planned independently of the rest of the query. To fix, drop the use of sub-queries to enforce RLS qual order, and instead mark each qual (RestrictInfo) with a security_level field establishing its priority for evaluation. Quals must be evaluated in security_level order, except that "leakproof" quals can be allowed to go ahead of quals of lower security_level, if it's helpful to do so. This has to be enforced within the ordering of any one list of quals to be evaluated at a table scan node, and we also have to ensure that quals are not chosen for early evaluation (i.e., use as an index qual or TID scan qual) if they're not allowed to go ahead of other quals at the scan node. This is sufficient to fix the problem for RLS quals, since we only support RLS policies on simple tables and thus RLS quals will always exist at the table scan level only. Eventually these qual ordering rules should be enforced for join quals as well, which would permit improving planning for explicit security-barrier views; but that's a task for another patch. Note that FDWs would need to be aware of these rules --- and not, for example, send an insecure qual for remote execution --- but since we do not yet allow RLS policies on foreign tables, the case doesn't arise. This will need to be addressed before we can allow such policies. Patch by me, reviewed by Stephen Frost and Dean Rasheed. Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
WRITE_UINT_FIELD(ec_min_security);
WRITE_UINT_FIELD(ec_max_security);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outEquivalenceMember(StringInfo str, const EquivalenceMember *node)
{
WRITE_NODE_TYPE("EQUIVALENCEMEMBER");
WRITE_NODE_FIELD(em_expr);
WRITE_BITMAPSET_FIELD(em_relids);
Fix planning of non-strict equivalence clauses above outer joins. If a potential equivalence clause references a variable from the nullable side of an outer join, the planner needs to take care that derived clauses are not pushed to below the outer join; else they may use the wrong value for the variable. (The problem arises only with non-strict clauses, since if an upper clause can be proven strict then the outer join will get simplified to a plain join.) The planner attempted to prevent this type of error by checking that potential equivalence clauses aren't outerjoin-delayed as a whole, but actually we have to check each side separately, since the two sides of the clause will get moved around separately if it's treated as an equivalence. Bugs of this type can be demonstrated as far back as 7.4, even though releases before 8.3 had only a very ad-hoc notion of equivalence clauses. In addition, we neglected to account for the possibility that such clauses might have nonempty nullable_relids even when not outerjoin-delayed; so the equivalence-class machinery lacked logic to compute correct nullable_relids values for clauses it constructs. This oversight was harmless before 9.2 because we were only using RestrictInfo.nullable_relids for OR clauses; but as of 9.2 it could result in pushing constructed equivalence clauses to incorrect places. (This accounts for bug #7604 from Bill MacArthur.) Fix the first problem by adding a new test check_equivalence_delay() in distribute_qual_to_rels, and fix the second one by adding code in equivclass.c and called functions to set correct nullable_relids for generated clauses. Although I believe the second part of this is not currently necessary before 9.2, I chose to back-patch it anyway, partly to keep the logic similar across branches and partly because it seems possible we might find other reasons why we need valid values of nullable_relids in the older branches. Add regression tests illustrating these problems. In 9.0 and up, also add test cases checking that we can push constants through outer joins, since we've broken that optimization before and I nearly broke it again with an overly simplistic patch for this problem.
2012-10-18 18:28:45 +02:00
WRITE_BITMAPSET_FIELD(em_nullable_relids);
WRITE_BOOL_FIELD(em_is_const);
WRITE_BOOL_FIELD(em_is_child);
WRITE_OID_FIELD(em_datatype);
}
static void
_outPathKey(StringInfo str, const PathKey *node)
{
WRITE_NODE_TYPE("PATHKEY");
WRITE_NODE_FIELD(pk_eclass);
WRITE_OID_FIELD(pk_opfamily);
WRITE_INT_FIELD(pk_strategy);
WRITE_BOOL_FIELD(pk_nulls_first);
}
static void
_outPathTarget(StringInfo str, const PathTarget *node)
{
WRITE_NODE_TYPE("PATHTARGET");
WRITE_NODE_FIELD(exprs);
WRITE_INDEX_ARRAY(sortgrouprefs, list_length(node->exprs));
WRITE_FLOAT_FIELD(cost.startup, "%.2f");
WRITE_FLOAT_FIELD(cost.per_tuple, "%.2f");
WRITE_INT_FIELD(width);
WRITE_ENUM_FIELD(has_volatile_expr, VolatileFunctionStatus);
}
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
static void
_outParamPathInfo(StringInfo str, const ParamPathInfo *node)
{
WRITE_NODE_TYPE("PARAMPATHINFO");
WRITE_BITMAPSET_FIELD(ppi_req_outer);
WRITE_FLOAT_FIELD(ppi_rows, "%.0f");
WRITE_NODE_FIELD(ppi_clauses);
}
static void
_outRestrictInfo(StringInfo str, const RestrictInfo *node)
{
WRITE_NODE_TYPE("RESTRICTINFO");
/* NB: this isn't a complete set of fields */
WRITE_NODE_FIELD(clause);
WRITE_BOOL_FIELD(is_pushed_down);
WRITE_BOOL_FIELD(outerjoin_delayed);
WRITE_BOOL_FIELD(can_join);
WRITE_BOOL_FIELD(pseudoconstant);
Improve RLS planning by marking individual quals with security levels. In an RLS query, we must ensure that security filter quals are evaluated before ordinary query quals, in case the latter contain "leaky" functions that could expose the contents of sensitive rows. The original implementation of RLS planning ensured this by pushing the scan of a secured table into a sub-query that it marked as a security-barrier view. Unfortunately this results in very inefficient plans in many cases, because the sub-query cannot be flattened and gets planned independently of the rest of the query. To fix, drop the use of sub-queries to enforce RLS qual order, and instead mark each qual (RestrictInfo) with a security_level field establishing its priority for evaluation. Quals must be evaluated in security_level order, except that "leakproof" quals can be allowed to go ahead of quals of lower security_level, if it's helpful to do so. This has to be enforced within the ordering of any one list of quals to be evaluated at a table scan node, and we also have to ensure that quals are not chosen for early evaluation (i.e., use as an index qual or TID scan qual) if they're not allowed to go ahead of other quals at the scan node. This is sufficient to fix the problem for RLS quals, since we only support RLS policies on simple tables and thus RLS quals will always exist at the table scan level only. Eventually these qual ordering rules should be enforced for join quals as well, which would permit improving planning for explicit security-barrier views; but that's a task for another patch. Note that FDWs would need to be aware of these rules --- and not, for example, send an insecure qual for remote execution --- but since we do not yet allow RLS policies on foreign tables, the case doesn't arise. This will need to be addressed before we can allow such policies. Patch by me, reviewed by Stephen Frost and Dean Rasheed. Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
WRITE_BOOL_FIELD(leakproof);
WRITE_ENUM_FIELD(has_volatile, VolatileFunctionStatus);
Improve RLS planning by marking individual quals with security levels. In an RLS query, we must ensure that security filter quals are evaluated before ordinary query quals, in case the latter contain "leaky" functions that could expose the contents of sensitive rows. The original implementation of RLS planning ensured this by pushing the scan of a secured table into a sub-query that it marked as a security-barrier view. Unfortunately this results in very inefficient plans in many cases, because the sub-query cannot be flattened and gets planned independently of the rest of the query. To fix, drop the use of sub-queries to enforce RLS qual order, and instead mark each qual (RestrictInfo) with a security_level field establishing its priority for evaluation. Quals must be evaluated in security_level order, except that "leakproof" quals can be allowed to go ahead of quals of lower security_level, if it's helpful to do so. This has to be enforced within the ordering of any one list of quals to be evaluated at a table scan node, and we also have to ensure that quals are not chosen for early evaluation (i.e., use as an index qual or TID scan qual) if they're not allowed to go ahead of other quals at the scan node. This is sufficient to fix the problem for RLS quals, since we only support RLS policies on simple tables and thus RLS quals will always exist at the table scan level only. Eventually these qual ordering rules should be enforced for join quals as well, which would permit improving planning for explicit security-barrier views; but that's a task for another patch. Note that FDWs would need to be aware of these rules --- and not, for example, send an insecure qual for remote execution --- but since we do not yet allow RLS policies on foreign tables, the case doesn't arise. This will need to be addressed before we can allow such policies. Patch by me, reviewed by Stephen Frost and Dean Rasheed. Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
WRITE_UINT_FIELD(security_level);
WRITE_BITMAPSET_FIELD(clause_relids);
WRITE_BITMAPSET_FIELD(required_relids);
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
WRITE_BITMAPSET_FIELD(outer_relids);
WRITE_BITMAPSET_FIELD(nullable_relids);
WRITE_BITMAPSET_FIELD(left_relids);
WRITE_BITMAPSET_FIELD(right_relids);
WRITE_NODE_FIELD(orclause);
/* don't write parent_ec, leads to infinite recursion in plan tree dump */
WRITE_FLOAT_FIELD(norm_selec, "%.4f");
WRITE_FLOAT_FIELD(outer_selec, "%.4f");
WRITE_NODE_FIELD(mergeopfamilies);
/* don't write left_ec, leads to infinite recursion in plan tree dump */
/* don't write right_ec, leads to infinite recursion in plan tree dump */
WRITE_NODE_FIELD(left_em);
WRITE_NODE_FIELD(right_em);
WRITE_BOOL_FIELD(outer_is_left);
WRITE_OID_FIELD(hashjoinoperator);
WRITE_OID_FIELD(left_hasheqoperator);
WRITE_OID_FIELD(right_hasheqoperator);
}
Refactor the representation of indexable clauses in IndexPaths. In place of three separate but interrelated lists (indexclauses, indexquals, and indexqualcols), an IndexPath now has one list "indexclauses" of IndexClause nodes. This holds basically the same information as before, but in a more useful format: in particular, there is now a clear connection between an indexclause (an original restriction clause from WHERE or JOIN/ON) and the indexquals (directly usable index conditions) derived from it. We also change the ground rules a bit by mandating that clause commutation, if needed, be done up-front so that what is stored in the indexquals list is always directly usable as an index condition. This gets rid of repeated re-determination of which side of the clause is the indexkey during costing and plan generation, as well as repeated lookups of the commutator operator. To minimize the added up-front cost, the typical case of commuting a plain OpExpr is handled by a new special-purpose function commute_restrictinfo(). For RowCompareExprs, generating the new clause properly commuted to begin with is not really any more complex than before, it's just different --- and we can save doing that work twice, as the pretty-klugy original implementation did. Tracking the connection between original and derived clauses lets us also track explicitly whether the derived clauses are an exact or lossy translation of the original. This provides a cheap solution to getting rid of unnecessary rechecks of boolean index clauses, which previously seemed like it'd be more expensive than it was worth. Another pleasant (IMO) side-effect is that EXPLAIN now always shows index clauses with the indexkey on the left; this seems less confusing. This commit leaves expand_indexqual_conditions() and some related functions in a slightly messy state. I didn't bother to change them any more than minimally necessary to work with the new data structure, because all that code is going to be refactored out of existence in a follow-on patch. Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
static void
_outIndexClause(StringInfo str, const IndexClause *node)
{
WRITE_NODE_TYPE("INDEXCLAUSE");
WRITE_NODE_FIELD(rinfo);
WRITE_NODE_FIELD(indexquals);
WRITE_BOOL_FIELD(lossy);
WRITE_INT_FIELD(indexcol);
WRITE_NODE_FIELD(indexcols);
}
static void
_outPlaceHolderVar(StringInfo str, const PlaceHolderVar *node)
{
WRITE_NODE_TYPE("PLACEHOLDERVAR");
WRITE_NODE_FIELD(phexpr);
WRITE_BITMAPSET_FIELD(phrels);
WRITE_UINT_FIELD(phid);
WRITE_UINT_FIELD(phlevelsup);
}
static void
_outSpecialJoinInfo(StringInfo str, const SpecialJoinInfo *node)
{
WRITE_NODE_TYPE("SPECIALJOININFO");
WRITE_BITMAPSET_FIELD(min_lefthand);
WRITE_BITMAPSET_FIELD(min_righthand);
Rewrite make_outerjoininfo's construction of min_lefthand and min_righthand sets for outer joins, in the light of bug #3588 and additional thought and experimentation. The original methodology was fatally flawed for nests of more than two outer joins: it got the relationships between adjacent joins right, but didn't always come to the right conclusions about whether a join could be interchanged with one two or more levels below it. This was largely caused by a mistaken idea that we should use the min_lefthand + min_righthand sets of a sub-join as the minimum left or right input set of an upper join when we conclude that the sub-join can't commute with the upper one. If there's a still-lower join that the sub-join *can* commute with, this method led us to think that that one could commute with the topmost join; which it can't. Another problem (not directly connected to bug #3588) was that make_outerjoininfo's processing-order-dependent method for enforcing outer join identity #3 didn't work right: if we decided that join A could safely commute with lower join B, we dropped all information about sub-joins under B that join A could perhaps not safely commute with, because we removed B's entire min_righthand from A's. To fix, make an explicit computation of all inner join combinations that occur below an outer join, and add to that the full syntactic relsets of any lower outer joins that we determine it can't commute with. This method gives much more direct enforcement of the outer join rearrangement identities, and it turns out not to cost a lot of additional bookkeeping. Thanks to Richard Harris for the bug report and test case.
2007-08-31 03:44:06 +02:00
WRITE_BITMAPSET_FIELD(syn_lefthand);
WRITE_BITMAPSET_FIELD(syn_righthand);
WRITE_ENUM_FIELD(jointype, JoinType);
WRITE_BOOL_FIELD(lhs_strict);
WRITE_BOOL_FIELD(delay_upper_joins);
Improve planner's cost estimation in the presence of semijoins. If we have a semijoin, say SELECT * FROM x WHERE x1 IN (SELECT y1 FROM y) and we're estimating the cost of a parameterized indexscan on x, the number of repetitions of the indexscan should not be taken as the size of y; it'll really only be the number of distinct values of y1, because the only valid plan with y on the outside of a nestloop would require y to be unique-ified before joining it to x. Most of the time this doesn't make that much difference, but sometimes it can lead to drastically underestimating the cost of the indexscan and hence choosing a bad plan, as pointed out by David Kubečka. Fixing this is a bit difficult because parameterized indexscans are costed out quite early in the planning process, before we have the information that would be needed to call estimate_num_groups() and thereby estimate the number of distinct values of the join column(s). However we can move the code that extracts a semijoin RHS's unique-ification columns, so that it's done in initsplan.c rather than on-the-fly in create_unique_path(). That shouldn't make any difference speed-wise and it's really a bit cleaner too. The other bit of information we need is the size of the semijoin RHS, which is easy if it's a single relation (we make those estimates before considering indexscan costs) but problematic if it's a join relation. The solution adopted here is just to use the product of the sizes of the join component rels. That will generally be an overestimate, but since estimate_num_groups() only uses this input as a clamp, an overestimate shouldn't hurt us too badly. In any case we don't allow this new logic to produce a value larger than we would have chosen before, so that at worst an overestimate leaves us no wiser than we were before.
2015-03-12 02:21:00 +01:00
WRITE_BOOL_FIELD(semi_can_btree);
WRITE_BOOL_FIELD(semi_can_hash);
WRITE_NODE_FIELD(semi_operators);
WRITE_NODE_FIELD(semi_rhs_exprs);
}
static void
_outAppendRelInfo(StringInfo str, const AppendRelInfo *node)
{
WRITE_NODE_TYPE("APPENDRELINFO");
WRITE_UINT_FIELD(parent_relid);
WRITE_UINT_FIELD(child_relid);
WRITE_OID_FIELD(parent_reltype);
WRITE_OID_FIELD(child_reltype);
WRITE_NODE_FIELD(translated_vars);
WRITE_INT_FIELD(num_child_cols);
WRITE_ATTRNUMBER_ARRAY(parent_colnos, node->num_child_cols);
WRITE_OID_FIELD(parent_reloid);
}
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
static void
_outRowIdentityVarInfo(StringInfo str, const RowIdentityVarInfo *node)
{
WRITE_NODE_TYPE("ROWIDENTITYVARINFO");
WRITE_NODE_FIELD(rowidvar);
WRITE_INT_FIELD(rowidwidth);
WRITE_STRING_FIELD(rowidname);
WRITE_BITMAPSET_FIELD(rowidrels);
}
static void
_outPlaceHolderInfo(StringInfo str, const PlaceHolderInfo *node)
{
WRITE_NODE_TYPE("PLACEHOLDERINFO");
WRITE_UINT_FIELD(phid);
WRITE_NODE_FIELD(ph_var);
WRITE_BITMAPSET_FIELD(ph_eval_at);
WRITE_BITMAPSET_FIELD(ph_lateral);
WRITE_BITMAPSET_FIELD(ph_needed);
WRITE_INT_FIELD(ph_width);
}
static void
_outMinMaxAggInfo(StringInfo str, const MinMaxAggInfo *node)
{
WRITE_NODE_TYPE("MINMAXAGGINFO");
WRITE_OID_FIELD(aggfnoid);
WRITE_OID_FIELD(aggsortop);
WRITE_NODE_FIELD(target);
/* We intentionally omit subroot --- too large, not interesting enough */
WRITE_NODE_FIELD(path);
WRITE_FLOAT_FIELD(pathcost, "%.2f");
WRITE_NODE_FIELD(param);
}
static void
_outPlannerParamItem(StringInfo str, const PlannerParamItem *node)
{
WRITE_NODE_TYPE("PLANNERPARAMITEM");
WRITE_NODE_FIELD(item);
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH. The planner previously assumed that parameter Vars having the same absolute query level, varno, and varattno could safely be assigned the same runtime PARAM_EXEC slot, even though they might be different Vars appearing in different subqueries. This was (probably) safe before the introduction of CTEs, but the lazy-evalution mechanism used for CTEs means that a CTE can be executed during execution of some other subquery, causing the lifespan of Params at the same syntactic nesting level as the CTE to overlap with use of the same slots inside the CTE. In 9.1 we created additional hazards by using the same parameter-assignment technology for nestloop inner scan parameters, but it was broken before that, as illustrated by the added regression test. To fix, restructure the planner's management of PlannerParamItems so that items having different semantic lifespans are kept rigorously separated. This will probably result in complex queries using more runtime PARAM_EXEC slots than before, but the slots are cheap enough that this hardly matters. Also, stop generating PlannerParamItems containing Params for subquery outputs: all we really need to do is reserve the PARAM_EXEC slot number, and that now only takes incrementing a counter. The planning code is simpler and probably faster than before, as well as being more correct. Per report from Vik Reykja. These changes will mostly also need to be made in the back branches, but I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
WRITE_INT_FIELD(paramId);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
/*****************************************************************************
*
* Stuff from extensible.h
*
*****************************************************************************/
static void
_outExtensibleNode(StringInfo str, const ExtensibleNode *node)
{
const ExtensibleNodeMethods *methods;
methods = GetExtensibleNodeMethods(node->extnodename, false);
WRITE_NODE_TYPE("EXTENSIBLENODE");
WRITE_STRING_FIELD(extnodename);
/* serialize the private fields */
methods->nodeOut(str, node);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
/*****************************************************************************
*
* Stuff from parsenodes.h.
*
*****************************************************************************/
/*
* print the basic stuff of all nodes that inherit from CreateStmt
*/
static void
_outCreateStmtInfo(StringInfo str, const CreateStmt *node)
{
WRITE_NODE_FIELD(relation);
WRITE_NODE_FIELD(tableElts);
WRITE_NODE_FIELD(inhRelations);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
WRITE_NODE_FIELD(partspec);
WRITE_NODE_FIELD(partbound);
WRITE_NODE_FIELD(ofTypename);
WRITE_NODE_FIELD(constraints);
WRITE_NODE_FIELD(options);
WRITE_ENUM_FIELD(oncommit, OnCommitAction);
WRITE_STRING_FIELD(tablespacename);
WRITE_STRING_FIELD(accessMethod);
WRITE_BOOL_FIELD(if_not_exists);
}
static void
_outCreateStmt(StringInfo str, const CreateStmt *node)
{
WRITE_NODE_TYPE("CREATESTMT");
_outCreateStmtInfo(str, (const CreateStmt *) node);
}
static void
_outCreateForeignTableStmt(StringInfo str, const CreateForeignTableStmt *node)
{
WRITE_NODE_TYPE("CREATEFOREIGNTABLESTMT");
_outCreateStmtInfo(str, (const CreateStmt *) node);
WRITE_STRING_FIELD(servername);
WRITE_NODE_FIELD(options);
}
static void
_outImportForeignSchemaStmt(StringInfo str, const ImportForeignSchemaStmt *node)
{
WRITE_NODE_TYPE("IMPORTFOREIGNSCHEMASTMT");
WRITE_STRING_FIELD(server_name);
WRITE_STRING_FIELD(remote_schema);
WRITE_STRING_FIELD(local_schema);
WRITE_ENUM_FIELD(list_type, ImportForeignSchemaType);
WRITE_NODE_FIELD(table_list);
WRITE_NODE_FIELD(options);
}
static void
_outIndexStmt(StringInfo str, const IndexStmt *node)
{
WRITE_NODE_TYPE("INDEXSTMT");
WRITE_STRING_FIELD(idxname);
WRITE_NODE_FIELD(relation);
WRITE_STRING_FIELD(accessMethod);
WRITE_STRING_FIELD(tableSpace);
WRITE_NODE_FIELD(indexParams);
WRITE_NODE_FIELD(indexIncludingParams);
WRITE_NODE_FIELD(options);
WRITE_NODE_FIELD(whereClause);
WRITE_NODE_FIELD(excludeOpNames);
WRITE_STRING_FIELD(idxcomment);
WRITE_OID_FIELD(indexOid);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
WRITE_OID_FIELD(oldNumber);
Skip WAL for new relfilenodes, under wal_level=minimal. Until now, only selected bulk operations (e.g. COPY) did this. If a given relfilenode received both a WAL-skipping COPY and a WAL-logged operation (e.g. INSERT), recovery could lose tuples from the COPY. See src/backend/access/transam/README section "Skipping WAL for New RelFileNode" for the new coding rules. Maintainers of table access methods should examine that section. To maintain data durability, just before commit, we choose between an fsync of the relfilenode and copying its contents to WAL. A new GUC, wal_skip_threshold, guides that choice. If this change slows a workload that creates small, permanent relfilenodes under wal_level=minimal, try adjusting wal_skip_threshold. Users setting a timeout on COMMIT may need to adjust that timeout, and log_min_duration_statement analysis will reflect time consumption moving to COMMIT from commands like COPY. Internally, this requires a reliable determination of whether RollbackAndReleaseCurrentSubTransaction() would unlink a relation's current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the specification of rd_createSubid such that the field is zero when a new rel has an old rd_node. Make relcache.c retain entries for certain dropped relations until end of transaction. Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN. Future servers accept older WAL, so this bump is discretionary. Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert Haas. Heikki Linnakangas and Michael Paquier implemented earlier designs that materially clarified the problem. Reviewed, in earlier designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane, Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout. Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
WRITE_UINT_FIELD(oldCreateSubid);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
WRITE_UINT_FIELD(oldFirstRelfilelocatorSubid);
WRITE_BOOL_FIELD(unique);
WRITE_BOOL_FIELD(nulls_not_distinct);
WRITE_BOOL_FIELD(primary);
WRITE_BOOL_FIELD(isconstraint);
WRITE_BOOL_FIELD(deferrable);
WRITE_BOOL_FIELD(initdeferred);
Get rid of multiple applications of transformExpr() to the same tree. transformExpr() has for many years had provisions to do nothing when applied to an already-transformed expression tree. However, this was always ugly and of dubious reliability, so we'd be much better off without it. The primary historical reason for it was that gram.y sometimes returned multiple links to the same subexpression, which is no longer true as of my BETWEEN fixes. We'd also grown some lazy hacks in CREATE TABLE LIKE (failing to distinguish between raw and already-transformed index specifications) and one or two other places. This patch removes the need for and support for re-transforming already transformed expressions. The index case is dealt with by adding a flag to struct IndexStmt to indicate that it's already been transformed; which has some benefit anyway in that tablecmds.c can now Assert that transformation has happened rather than just assuming. The other main reason was some rather sloppy code for array type coercion, which can be fixed (and its performance improved too) by refactoring. I did leave transformJoinUsingClause() still constructing expressions containing untransformed operator nodes being applied to Vars, so that transformExpr() still has to allow Var inputs. But that's a much narrower, and safer, special case than before, since Vars will never appear in a raw parse tree, and they don't have any substructure to worry about. In passing fix some oversights in the patch that added CREATE INDEX IF NOT EXISTS (missing processing of IndexStmt.if_not_exists). These appear relatively harmless, but still sloppy coding practice.
2015-02-22 19:59:09 +01:00
WRITE_BOOL_FIELD(transformed);
WRITE_BOOL_FIELD(concurrent);
Get rid of multiple applications of transformExpr() to the same tree. transformExpr() has for many years had provisions to do nothing when applied to an already-transformed expression tree. However, this was always ugly and of dubious reliability, so we'd be much better off without it. The primary historical reason for it was that gram.y sometimes returned multiple links to the same subexpression, which is no longer true as of my BETWEEN fixes. We'd also grown some lazy hacks in CREATE TABLE LIKE (failing to distinguish between raw and already-transformed index specifications) and one or two other places. This patch removes the need for and support for re-transforming already transformed expressions. The index case is dealt with by adding a flag to struct IndexStmt to indicate that it's already been transformed; which has some benefit anyway in that tablecmds.c can now Assert that transformation has happened rather than just assuming. The other main reason was some rather sloppy code for array type coercion, which can be fixed (and its performance improved too) by refactoring. I did leave transformJoinUsingClause() still constructing expressions containing untransformed operator nodes being applied to Vars, so that transformExpr() still has to allow Var inputs. But that's a much narrower, and safer, special case than before, since Vars will never appear in a raw parse tree, and they don't have any substructure to worry about. In passing fix some oversights in the patch that added CREATE INDEX IF NOT EXISTS (missing processing of IndexStmt.if_not_exists). These appear relatively harmless, but still sloppy coding practice.
2015-02-22 19:59:09 +01:00
WRITE_BOOL_FIELD(if_not_exists);
WRITE_BOOL_FIELD(reset_default_tblspc);
}
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
static void
_outCreateStatsStmt(StringInfo str, const CreateStatsStmt *node)
{
WRITE_NODE_TYPE("CREATESTATSSTMT");
WRITE_NODE_FIELD(defnames);
WRITE_NODE_FIELD(stat_types);
WRITE_NODE_FIELD(exprs);
WRITE_NODE_FIELD(relations);
WRITE_STRING_FIELD(stxcomment);
WRITE_BOOL_FIELD(transformed);
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
WRITE_BOOL_FIELD(if_not_exists);
}
static void
_outAlterStatsStmt(StringInfo str, const AlterStatsStmt *node)
{
WRITE_NODE_TYPE("ALTERSTATSSTMT");
WRITE_NODE_FIELD(defnames);
WRITE_INT_FIELD(stxstattarget);
WRITE_BOOL_FIELD(missing_ok);
}
static void
_outNotifyStmt(StringInfo str, const NotifyStmt *node)
{
WRITE_NODE_TYPE("NOTIFYSTMT");
WRITE_STRING_FIELD(conditionname);
WRITE_STRING_FIELD(payload);
}
static void
_outDeclareCursorStmt(StringInfo str, const DeclareCursorStmt *node)
{
WRITE_NODE_TYPE("DECLARECURSORSTMT");
WRITE_STRING_FIELD(portalname);
WRITE_INT_FIELD(options);
WRITE_NODE_FIELD(query);
}
static void
_outSelectStmt(StringInfo str, const SelectStmt *node)
{
WRITE_NODE_TYPE("SELECT");
WRITE_NODE_FIELD(distinctClause);
WRITE_NODE_FIELD(intoClause);
WRITE_NODE_FIELD(targetList);
WRITE_NODE_FIELD(fromClause);
WRITE_NODE_FIELD(whereClause);
WRITE_NODE_FIELD(groupClause);
WRITE_BOOL_FIELD(groupDistinct);
WRITE_NODE_FIELD(havingClause);
WRITE_NODE_FIELD(windowClause);
WRITE_NODE_FIELD(valuesLists);
WRITE_NODE_FIELD(sortClause);
WRITE_NODE_FIELD(limitOffset);
WRITE_NODE_FIELD(limitCount);
WRITE_ENUM_FIELD(limitOption, LimitOption);
WRITE_NODE_FIELD(lockingClause);
WRITE_NODE_FIELD(withClause);
WRITE_ENUM_FIELD(op, SetOperation);
WRITE_BOOL_FIELD(all);
WRITE_NODE_FIELD(larg);
WRITE_NODE_FIELD(rarg);
}
SQL-standard function body This adds support for writing CREATE FUNCTION and CREATE PROCEDURE statements for language SQL with a function body that conforms to the SQL standard and is portable to other implementations. Instead of the PostgreSQL-specific AS $$ string literal $$ syntax, this allows writing out the SQL statements making up the body unquoted, either as a single statement: CREATE FUNCTION add(a integer, b integer) RETURNS integer LANGUAGE SQL RETURN a + b; or as a block CREATE PROCEDURE insert_data(a integer, b integer) LANGUAGE SQL BEGIN ATOMIC INSERT INTO tbl VALUES (a); INSERT INTO tbl VALUES (b); END; The function body is parsed at function definition time and stored as expression nodes in a new pg_proc column prosqlbody. So at run time, no further parsing is required. However, this form does not support polymorphic arguments, because there is no more parse analysis done at call time. Dependencies between the function and the objects it uses are fully tracked. A new RETURN statement is introduced. This can only be used inside function bodies. Internally, it is treated much like a SELECT statement. psql needs some new intelligence to keep track of function body boundaries so that it doesn't send off statements when it sees semicolons that are inside a function body. Tested-by: Jaime Casanova <jcasanov@systemguards.com.ec> Reviewed-by: Julien Rouhaud <rjuju123@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/1c11f1eb-f00c-43b7-799d-2d44132c02d7@2ndquadrant.com
2021-04-07 21:30:08 +02:00
static void
_outReturnStmt(StringInfo str, const ReturnStmt *node)
{
WRITE_NODE_TYPE("RETURN");
WRITE_NODE_FIELD(returnval);
}
static void
_outPLAssignStmt(StringInfo str, const PLAssignStmt *node)
{
WRITE_NODE_TYPE("PLASSIGN");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(indirection);
WRITE_INT_FIELD(nnames);
WRITE_NODE_FIELD(val);
WRITE_LOCATION_FIELD(location);
}
static void
_outFuncCall(StringInfo str, const FuncCall *node)
{
WRITE_NODE_TYPE("FUNCCALL");
WRITE_NODE_FIELD(funcname);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(agg_order);
WRITE_NODE_FIELD(agg_filter);
Improve our ability to regurgitate SQL-syntax function calls. The SQL spec calls out nonstandard syntax for certain function calls, for example substring() with numeric position info is supposed to be spelled "SUBSTRING(string FROM start FOR count)". We accept many of these things, but up to now would not print them in the same format, instead simplifying down to "substring"(string, start, count). That's long annoyed me because it creates an interoperability problem: we're gratuitously injecting Postgres-specific syntax into what might otherwise be a perfectly spec-compliant view definition. However, the real reason for addressing it right now is to support a planned change in the semantics of EXTRACT() a/k/a date_part(). When we switch that to returning numeric, we'll have the parser translate EXTRACT() to some new function name (might as well be "extract" if you ask me) and then teach ruleutils.c to reverse-list that per SQL spec. In this way existing calls to date_part() will continue to have the old semantics. To implement this, invent a new CoercionForm value COERCE_SQL_SYNTAX, and make the parser insert that rather than COERCE_EXPLICIT_CALL when the input has SQL-spec decoration. (But if the input has the form of a plain function call, continue to mark it COERCE_EXPLICIT_CALL, even if it's calling one of these functions.) Then ruleutils.c recognizes COERCE_SQL_SYNTAX as a cue to emit SQL call syntax. It can know which decoration to emit using hard-wired knowledge about the functions that could be called this way. (While this solution isn't extensible without manual additions, neither is the grammar, so this doesn't seem unmaintainable.) Notice that this solution will reverse-list a function call with SQL decoration only if it was entered that way; so dump-and-reload will not by itself produce any changes in the appearance of views. This requires adding a CoercionForm field to struct FuncCall. (I couldn't resist the temptation to rearrange that struct's field order a tad while I was at it.) FuncCall doesn't appear in stored rules, so that change isn't a reason for a catversion bump, but I did one anyway because the new enum value for CoercionForm fields could confuse old backend code. Possible future work: * Perhaps CoercionForm should now be renamed to DisplayForm, or something like that, to reflect its more general meaning. This'd require touching a couple hundred places, so it's not clear it's worth the code churn. * The SQLValueFunction node type, which was invented partly for the same goal of improving SQL-compatibility of view output, could perhaps be replaced with regular function calls marked with COERCE_SQL_SYNTAX. It's unclear if this would be a net code savings, however. Discussion: https://postgr.es/m/42b73d2d-da12-ba9f-570a-420e0cce19d9@phystech.edu
2020-11-04 18:34:50 +01:00
WRITE_NODE_FIELD(over);
Support ordered-set (WITHIN GROUP) aggregates. This patch introduces generic support for ordered-set and hypothetical-set aggregate functions, as well as implementations of the instances defined in SQL:2008 (percentile_cont(), percentile_disc(), rank(), dense_rank(), percent_rank(), cume_dist()). We also added mode() though it is not in the spec, as well as versions of percentile_cont() and percentile_disc() that can compute multiple percentile values in one pass over the data. Unlike the original submission, this patch puts full control of the sorting process in the hands of the aggregate's support functions. To allow the support functions to find out how they're supposed to sort, a new API function AggGetAggref() is added to nodeAgg.c. This allows retrieval of the aggregate call's Aggref node, which may have other uses beyond the immediate need. There is also support for ordered-set aggregates to install cleanup callback functions, so that they can be sure that infrastructure such as tuplesort objects gets cleaned up. In passing, make some fixes in the recently-added support for variadic aggregates, and make some editorial adjustments in the recent FILTER additions for aggregates. Also, simplify use of IsBinaryCoercible() by allowing it to succeed whenever the target type is ANY or ANYELEMENT. It was inconsistent that it dealt with other polymorphic target types but not these. Atri Sharma and Andrew Gierth; reviewed by Pavel Stehule and Vik Fearing, and rather heavily editorialized upon by Tom Lane
2013-12-23 22:11:35 +01:00
WRITE_BOOL_FIELD(agg_within_group);
WRITE_BOOL_FIELD(agg_star);
WRITE_BOOL_FIELD(agg_distinct);
WRITE_BOOL_FIELD(func_variadic);
Improve our ability to regurgitate SQL-syntax function calls. The SQL spec calls out nonstandard syntax for certain function calls, for example substring() with numeric position info is supposed to be spelled "SUBSTRING(string FROM start FOR count)". We accept many of these things, but up to now would not print them in the same format, instead simplifying down to "substring"(string, start, count). That's long annoyed me because it creates an interoperability problem: we're gratuitously injecting Postgres-specific syntax into what might otherwise be a perfectly spec-compliant view definition. However, the real reason for addressing it right now is to support a planned change in the semantics of EXTRACT() a/k/a date_part(). When we switch that to returning numeric, we'll have the parser translate EXTRACT() to some new function name (might as well be "extract" if you ask me) and then teach ruleutils.c to reverse-list that per SQL spec. In this way existing calls to date_part() will continue to have the old semantics. To implement this, invent a new CoercionForm value COERCE_SQL_SYNTAX, and make the parser insert that rather than COERCE_EXPLICIT_CALL when the input has SQL-spec decoration. (But if the input has the form of a plain function call, continue to mark it COERCE_EXPLICIT_CALL, even if it's calling one of these functions.) Then ruleutils.c recognizes COERCE_SQL_SYNTAX as a cue to emit SQL call syntax. It can know which decoration to emit using hard-wired knowledge about the functions that could be called this way. (While this solution isn't extensible without manual additions, neither is the grammar, so this doesn't seem unmaintainable.) Notice that this solution will reverse-list a function call with SQL decoration only if it was entered that way; so dump-and-reload will not by itself produce any changes in the appearance of views. This requires adding a CoercionForm field to struct FuncCall. (I couldn't resist the temptation to rearrange that struct's field order a tad while I was at it.) FuncCall doesn't appear in stored rules, so that change isn't a reason for a catversion bump, but I did one anyway because the new enum value for CoercionForm fields could confuse old backend code. Possible future work: * Perhaps CoercionForm should now be renamed to DisplayForm, or something like that, to reflect its more general meaning. This'd require touching a couple hundred places, so it's not clear it's worth the code churn. * The SQLValueFunction node type, which was invented partly for the same goal of improving SQL-compatibility of view output, could perhaps be replaced with regular function calls marked with COERCE_SQL_SYNTAX. It's unclear if this would be a net code savings, however. Discussion: https://postgr.es/m/42b73d2d-da12-ba9f-570a-420e0cce19d9@phystech.edu
2020-11-04 18:34:50 +01:00
WRITE_ENUM_FIELD(funcformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
}
static void
_outDefElem(StringInfo str, const DefElem *node)
{
WRITE_NODE_TYPE("DEFELEM");
WRITE_STRING_FIELD(defnamespace);
WRITE_STRING_FIELD(defname);
WRITE_NODE_FIELD(arg);
WRITE_ENUM_FIELD(defaction, DefElemAction);
WRITE_LOCATION_FIELD(location);
}
static void
_outTableLikeClause(StringInfo str, const TableLikeClause *node)
{
WRITE_NODE_TYPE("TABLELIKECLAUSE");
WRITE_NODE_FIELD(relation);
WRITE_UINT_FIELD(options);
WRITE_OID_FIELD(relationOid);
}
static void
_outLockingClause(StringInfo str, const LockingClause *node)
{
WRITE_NODE_TYPE("LOCKINGCLAUSE");
WRITE_NODE_FIELD(lockedRels);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
WRITE_ENUM_FIELD(strength, LockClauseStrength);
WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
}
static void
_outXmlSerialize(StringInfo str, const XmlSerialize *node)
{
WRITE_NODE_TYPE("XMLSERIALIZE");
WRITE_ENUM_FIELD(xmloption, XmlOptionType);
WRITE_NODE_FIELD(expr);
WRITE_NODE_FIELD(typeName);
WRITE_LOCATION_FIELD(location);
}
static void
_outTriggerTransition(StringInfo str, const TriggerTransition *node)
{
WRITE_NODE_TYPE("TRIGGERTRANSITION");
WRITE_STRING_FIELD(name);
WRITE_BOOL_FIELD(isNew);
WRITE_BOOL_FIELD(isTable);
}
static void
_outColumnDef(StringInfo str, const ColumnDef *node)
{
WRITE_NODE_TYPE("COLUMNDEF");
WRITE_STRING_FIELD(colname);
WRITE_NODE_FIELD(typeName);
Allow configurable LZ4 TOAST compression. There is now a per-column COMPRESSION option which can be set to pglz (the default, and the only option in up until now) or lz4. Or, if you like, you can set the new default_toast_compression GUC to lz4, and then that will be the default for new table columns for which no value is specified. We don't have lz4 support in the PostgreSQL code, so to use lz4 compression, PostgreSQL must be built --with-lz4. In general, TOAST compression means compression of individual column values, not the whole tuple, and those values can either be compressed inline within the tuple or compressed and then stored externally in the TOAST table, so those properties also apply to this feature. Prior to this commit, a TOAST pointer has two unused bits as part of the va_extsize field, and a compessed datum has two unused bits as part of the va_rawsize field. These bits are unused because the length of a varlena is limited to 1GB; we now use them to indicate the compression type that was used. This means we only have bit space for 2 more built-in compresison types, but we could work around that problem, if necessary, by introducing a new vartag_external value for any further types we end up wanting to add. Hopefully, it won't be too important to offer a wide selection of algorithms here, since each one we add not only takes more coding but also adds a build dependency for every packager. Nevertheless, it seems worth doing at least this much, because LZ4 gets better compression than PGLZ with less CPU usage. It's possible for LZ4-compressed datums to leak into composite type values stored on disk, just as it is for PGLZ. It's also possible for LZ4-compressed attributes to be copied into a different table via SQL commands such as CREATE TABLE AS or INSERT .. SELECT. It would be expensive to force such values to be decompressed, so PostgreSQL has never done so. For the same reasons, we also don't force recompression of already-compressed values even if the target table prefers a different compression method than was used for the source data. These architectural decisions are perhaps arguable but revisiting them is well beyond the scope of what seemed possible to do as part of this project. However, it's relatively cheap to recompress as part of VACUUM FULL or CLUSTER, so this commit adjusts those commands to do so, if the configured compression method of the table happens not to match what was used for some column value stored therein. Dilip Kumar. The original patches on which this work was based were written by Ildus Kurbangaliev, and those were patches were based on even earlier work by Nikita Glukhov, but the design has since changed very substantially, since allow a potentially large number of compression methods that could be added and dropped on a running system proved too problematic given some of the architectural issues mentioned above; the choice of which specific compression method to add first is now different; and a lot of the code has been heavily refactored. More recently, Justin Przyby helped quite a bit with testing and reviewing and this version also includes some code contributions from him. Other design input and review from Tomas Vondra, Álvaro Herrera, Andres Freund, Oleg Bartunov, Alexander Korotkov, and me. Discussion: http://postgr.es/m/20170907194236.4cefce96%40wp.localdomain Discussion: http://postgr.es/m/CAFiTN-uUpX3ck%3DK0mLEk-G_kUQY%3DSNOTeqdaNRR9FMdQrHKebw%40mail.gmail.com
2021-03-19 20:10:38 +01:00
WRITE_STRING_FIELD(compression);
WRITE_INT_FIELD(inhcount);
WRITE_BOOL_FIELD(is_local);
WRITE_BOOL_FIELD(is_not_null);
Remove collation information from TypeName, where it does not belong. The initial collations patch treated a COLLATE spec as part of a TypeName, following what can only be described as brain fade on the part of the SQL committee. It's a lot more reasonable to treat COLLATE as a syntactically separate object, so that it can be added in only the productions where it actually belongs, rather than needing to reject it in a boatload of places where it doesn't belong (something the original patch mostly failed to do). In addition this change lets us meet the spec's requirement to allow COLLATE anywhere in the clauses of a ColumnDef, and it avoids unfriendly behavior for constructs such as "foo::type COLLATE collation". To do this, pull collation information out of TypeName and put it in ColumnDef instead, thus reverting most of the collation-related changes in parse_type.c's API. I made one additional structural change, which was to use a ColumnDef as an intermediate node in AT_AlterColumnType AlterTableCmd nodes. This provides enough room to get rid of the "transform" wart in AlterTableCmd too, since the ColumnDef can carry the USING expression easily enough. Also fix some other minor bugs that have crept in in the same areas, like failure to copy recently-added fields of ColumnDef in copyfuncs.c. While at it, document the formerly secret ability to specify a collation in ALTER TABLE ALTER COLUMN TYPE, ALTER TYPE ADD ATTRIBUTE, and ALTER TYPE ALTER ATTRIBUTE TYPE; and correct some misstatements about what the default collation selection will be when COLLATE is omitted. BTW, the three-parameter form of format_type() should go away too, since it just contributes to the confusion in this area; but I'll do that in a separate patch.
2011-03-10 04:38:52 +01:00
WRITE_BOOL_FIELD(is_from_type);
WRITE_CHAR_FIELD(storage);
WRITE_NODE_FIELD(raw_default);
WRITE_NODE_FIELD(cooked_default);
WRITE_CHAR_FIELD(identity);
WRITE_NODE_FIELD(identitySequence);
WRITE_CHAR_FIELD(generated);
Remove collation information from TypeName, where it does not belong. The initial collations patch treated a COLLATE spec as part of a TypeName, following what can only be described as brain fade on the part of the SQL committee. It's a lot more reasonable to treat COLLATE as a syntactically separate object, so that it can be added in only the productions where it actually belongs, rather than needing to reject it in a boatload of places where it doesn't belong (something the original patch mostly failed to do). In addition this change lets us meet the spec's requirement to allow COLLATE anywhere in the clauses of a ColumnDef, and it avoids unfriendly behavior for constructs such as "foo::type COLLATE collation". To do this, pull collation information out of TypeName and put it in ColumnDef instead, thus reverting most of the collation-related changes in parse_type.c's API. I made one additional structural change, which was to use a ColumnDef as an intermediate node in AT_AlterColumnType AlterTableCmd nodes. This provides enough room to get rid of the "transform" wart in AlterTableCmd too, since the ColumnDef can carry the USING expression easily enough. Also fix some other minor bugs that have crept in in the same areas, like failure to copy recently-added fields of ColumnDef in copyfuncs.c. While at it, document the formerly secret ability to specify a collation in ALTER TABLE ALTER COLUMN TYPE, ALTER TYPE ADD ATTRIBUTE, and ALTER TYPE ALTER ATTRIBUTE TYPE; and correct some misstatements about what the default collation selection will be when COLLATE is omitted. BTW, the three-parameter form of format_type() should go away too, since it just contributes to the confusion in this area; but I'll do that in a separate patch.
2011-03-10 04:38:52 +01:00
WRITE_NODE_FIELD(collClause);
WRITE_OID_FIELD(collOid);
WRITE_NODE_FIELD(constraints);
WRITE_NODE_FIELD(fdwoptions);
WRITE_LOCATION_FIELD(location);
}
static void
_outTypeName(StringInfo str, const TypeName *node)
{
WRITE_NODE_TYPE("TYPENAME");
WRITE_NODE_FIELD(names);
WRITE_OID_FIELD(typeOid);
WRITE_BOOL_FIELD(setof);
WRITE_BOOL_FIELD(pct_type);
WRITE_NODE_FIELD(typmods);
WRITE_INT_FIELD(typemod);
WRITE_NODE_FIELD(arrayBounds);
WRITE_LOCATION_FIELD(location);
}
static void
_outTypeCast(StringInfo str, const TypeCast *node)
{
WRITE_NODE_TYPE("TYPECAST");
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(typeName);
WRITE_LOCATION_FIELD(location);
}
static void
_outCollateClause(StringInfo str, const CollateClause *node)
{
WRITE_NODE_TYPE("COLLATECLAUSE");
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(collname);
WRITE_LOCATION_FIELD(location);
}
static void
_outIndexElem(StringInfo str, const IndexElem *node)
{
WRITE_NODE_TYPE("INDEXELEM");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(expr);
WRITE_STRING_FIELD(indexcolname);
WRITE_NODE_FIELD(collation);
WRITE_NODE_FIELD(opclass);
Implement operator class parameters PostgreSQL provides set of template index access methods, where opclasses have much freedom in the semantics of indexing. These index AMs are GiST, GIN, SP-GiST and BRIN. There opclasses define representation of keys, operations on them and supported search strategies. So, it's natural that opclasses may be faced some tradeoffs, which require user-side decision. This commit implements opclass parameters allowing users to set some values, which tell opclass how to index the particular dataset. This commit doesn't introduce new storage in system catalog. Instead it uses pg_attribute.attoptions, which is used for table column storage options but unused for index attributes. In order to evade changing signature of each opclass support function, we implement unified way to pass options to opclass support functions. Options are set to fn_expr as the constant bytea expression. It's possible due to the fact that opclass support functions are executed outside of expressions, so fn_expr is unused for them. This commit comes with some examples of opclass options usage. We parametrize signature length in GiST. That applies to multiple opclasses: tsvector_ops, gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and gist_hstore_ops. Also we parametrize maximum number of integer ranges for gist__int_ops. However, the main future usage of this feature is expected to be json, where users would be able to specify which way to index particular json parts. Catversion is bumped. Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru Author: Nikita Glukhov, revised by me Reviwed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
WRITE_NODE_FIELD(opclassopts);
WRITE_ENUM_FIELD(ordering, SortByDir);
WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
}
Extended statistics on expressions Allow defining extended statistics on expressions, not just just on simple column references. With this commit, expressions are supported by all existing extended statistics kinds, improving the same types of estimates. A simple example may look like this: CREATE TABLE t (a int); CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; ANALYZE t; The collected statistics are useful e.g. to estimate queries with those expressions in WHERE or GROUP BY clauses: SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); This introduces new internal statistics kind 'e' (expressions) which is built automatically when the statistics object definition includes any expressions. This represents single-expression statistics, as if there was an expression index (but without the index maintenance overhead). The statistics is stored in pg_statistics_ext_data as an array of composite types, which is possible thanks to 79f6a942bd. CREATE STATISTICS allows building statistics on a single expression, in which case in which case it's not possible to specify statistics kinds. A new system view pg_stats_ext_exprs can be used to display expression statistics, similarly to pg_stats and pg_stats_ext views. ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it treats indexes, i.e. it drops and recreates the statistics. This means all statistics are reset, and we no longer try to preserve at least the functional dependencies. This should not be a major issue in practice, as the functional dependencies actually rely on per-column statistics, which were always reset anyway. Author: Tomas Vondra Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
2021-03-26 23:22:01 +01:00
static void
_outStatsElem(StringInfo str, const StatsElem *node)
{
WRITE_NODE_TYPE("STATSELEM");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(expr);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
Extended statistics on expressions Allow defining extended statistics on expressions, not just just on simple column references. With this commit, expressions are supported by all existing extended statistics kinds, improving the same types of estimates. A simple example may look like this: CREATE TABLE t (a int); CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; ANALYZE t; The collected statistics are useful e.g. to estimate queries with those expressions in WHERE or GROUP BY clauses: SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); This introduces new internal statistics kind 'e' (expressions) which is built automatically when the statistics object definition includes any expressions. This represents single-expression statistics, as if there was an expression index (but without the index maintenance overhead). The statistics is stored in pg_statistics_ext_data as an array of composite types, which is possible thanks to 79f6a942bd. CREATE STATISTICS allows building statistics on a single expression, in which case in which case it's not possible to specify statistics kinds. A new system view pg_stats_ext_exprs can be used to display expression statistics, similarly to pg_stats and pg_stats_ext views. ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it treats indexes, i.e. it drops and recreates the statistics. This means all statistics are reset, and we no longer try to preserve at least the functional dependencies. This should not be a major issue in practice, as the functional dependencies actually rely on per-column statistics, which were always reset anyway. Author: Tomas Vondra Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
2021-03-26 23:22:01 +01:00
static void
_outQuery(StringInfo str, const Query *node)
{
WRITE_NODE_TYPE("QUERY");
WRITE_ENUM_FIELD(commandType, CmdType);
WRITE_ENUM_FIELD(querySource, QuerySource);
/* we intentionally do not print the queryId field */
WRITE_BOOL_FIELD(canSetTag);
/*
* Hack to work around missing outfuncs routines for a lot of the
* utility-statement node types. (The only one we actually *need* for
* rules support is NotifyStmt.) Someday we ought to support 'em all, but
* for the meantime do this to avoid getting lots of warnings when running
* with debug_print_parse on.
*/
if (node->utilityStmt)
{
switch (nodeTag(node->utilityStmt))
{
case T_CreateStmt:
case T_IndexStmt:
case T_NotifyStmt:
case T_DeclareCursorStmt:
WRITE_NODE_FIELD(utilityStmt);
break;
default:
appendStringInfoString(str, " :utilityStmt ?");
break;
}
}
else
appendStringInfoString(str, " :utilityStmt <>");
WRITE_INT_FIELD(resultRelation);
WRITE_BOOL_FIELD(hasAggs);
WRITE_BOOL_FIELD(hasWindowFuncs);
Improve parser's and planner's handling of set-returning functions. Teach the parser to reject misplaced set-returning functions during parse analysis using p_expr_kind, in much the same way as we do for aggregates and window functions (cf commit eaccfded9). While this isn't complete (it misses nesting-based restrictions), it's much better than the previous error reporting for such cases, and it allows elimination of assorted ad-hoc expression_returns_set() error checks. We could add nesting checks later if it seems important to catch all cases at parse time. There is one case the parser will now throw error for although previous versions allowed it, which is SRFs in the tlist of an UPDATE. That never behaved sensibly (since it's ill-defined which generated row should be used to perform the update) and it's hard to see why it should not be treated as an error. It's a release-note-worthy change though. Also, add a new Query field hasTargetSRFs reporting whether there are any SRFs in the targetlist (including GROUP BY/ORDER BY expressions). The parser can now set that basically for free during parse analysis, and we can use it in a number of places to avoid expression_returns_set searches. (There will be more such checks soon.) In some places, this allows decontorting the logic since it's no longer expensive to check for SRFs in the tlist --- so I made the checks parallel to the handling of hasAggs/hasWindowFuncs wherever it seemed appropriate. catversion bump because adding a Query field changes stored rules. Andres Freund and Tom Lane Discussion: <24639.1473782855@sss.pgh.pa.us>
2016-09-13 19:54:24 +02:00
WRITE_BOOL_FIELD(hasTargetSRFs);
WRITE_BOOL_FIELD(hasSubLinks);
WRITE_BOOL_FIELD(hasDistinctOn);
WRITE_BOOL_FIELD(hasRecursive);
WRITE_BOOL_FIELD(hasModifyingCTE);
WRITE_BOOL_FIELD(hasForUpdate);
Row-Level Security Policies (RLS) Building on the updatable security-barrier views work, add the ability to define policies on tables to limit the set of rows which are returned from a query and which are allowed to be added to a table. Expressions defined by the policy for filtering are added to the security barrier quals of the query, while expressions defined to check records being added to a table are added to the with-check options of the query. New top-level commands are CREATE/ALTER/DROP POLICY and are controlled by the table owner. Row Security is able to be enabled and disabled by the owner on a per-table basis using ALTER TABLE .. ENABLE/DISABLE ROW SECURITY. Per discussion, ROW SECURITY is disabled on tables by default and must be enabled for policies on the table to be used. If no policies exist on a table with ROW SECURITY enabled, a default-deny policy is used and no records will be visible. By default, row security is applied at all times except for the table owner and the superuser. A new GUC, row_security, is added which can be set to ON, OFF, or FORCE. When set to FORCE, row security will be applied even for the table owner and superusers. When set to OFF, row security will be disabled when allowed and an error will be thrown if the user does not have rights to bypass row security. Per discussion, pg_dump sets row_security = OFF by default to ensure that exports and backups will have all data in the table or will error if there are insufficient privileges to bypass row security. A new option has been added to pg_dump, --enable-row-security, to ask pg_dump to export with row security enabled. A new role capability, BYPASSRLS, which can only be set by the superuser, is added to allow other users to be able to bypass row security using row_security = OFF. Many thanks to the various individuals who have helped with the design, particularly Robert Haas for his feedback. Authors include Craig Ringer, KaiGai Kohei, Adam Brightwell, Dean Rasheed, with additional changes and rework by me. Reviewers have included all of the above, Greg Smith, Jeff McCormick, and Robert Haas.
2014-09-19 17:18:35 +02:00
WRITE_BOOL_FIELD(hasRowSecurity);
SQL-standard function body This adds support for writing CREATE FUNCTION and CREATE PROCEDURE statements for language SQL with a function body that conforms to the SQL standard and is portable to other implementations. Instead of the PostgreSQL-specific AS $$ string literal $$ syntax, this allows writing out the SQL statements making up the body unquoted, either as a single statement: CREATE FUNCTION add(a integer, b integer) RETURNS integer LANGUAGE SQL RETURN a + b; or as a block CREATE PROCEDURE insert_data(a integer, b integer) LANGUAGE SQL BEGIN ATOMIC INSERT INTO tbl VALUES (a); INSERT INTO tbl VALUES (b); END; The function body is parsed at function definition time and stored as expression nodes in a new pg_proc column prosqlbody. So at run time, no further parsing is required. However, this form does not support polymorphic arguments, because there is no more parse analysis done at call time. Dependencies between the function and the objects it uses are fully tracked. A new RETURN statement is introduced. This can only be used inside function bodies. Internally, it is treated much like a SELECT statement. psql needs some new intelligence to keep track of function body boundaries so that it doesn't send off statements when it sees semicolons that are inside a function body. Tested-by: Jaime Casanova <jcasanov@systemguards.com.ec> Reviewed-by: Julien Rouhaud <rjuju123@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/1c11f1eb-f00c-43b7-799d-2d44132c02d7@2ndquadrant.com
2021-04-07 21:30:08 +02:00
WRITE_BOOL_FIELD(isReturn);
WRITE_NODE_FIELD(cteList);
WRITE_NODE_FIELD(rtable);
WRITE_NODE_FIELD(jointree);
WRITE_NODE_FIELD(targetList);
WRITE_ENUM_FIELD(override, OverridingKind);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
WRITE_NODE_FIELD(onConflict);
WRITE_NODE_FIELD(returningList);
WRITE_NODE_FIELD(groupClause);
WRITE_BOOL_FIELD(groupDistinct);
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
WRITE_NODE_FIELD(groupingSets);
WRITE_NODE_FIELD(havingQual);
WRITE_NODE_FIELD(windowClause);
WRITE_NODE_FIELD(distinctClause);
WRITE_NODE_FIELD(sortClause);
WRITE_NODE_FIELD(limitOffset);
WRITE_NODE_FIELD(limitCount);
WRITE_ENUM_FIELD(limitOption, LimitOption);
WRITE_NODE_FIELD(rowMarks);
WRITE_NODE_FIELD(setOperations);
WRITE_NODE_FIELD(constraintDeps);
Fix some minor issues exposed by outfuncs/readfuncs testing. A test patch to pass parse and plan trees through outfuncs + readfuncs exposed several issues that need to be fixed to get clean matches: Query.withCheckOptions failed to get copied; it's intentionally ignored by outfuncs/readfuncs on the grounds that it'd always be NIL anyway in stored rules. This seems less than future-proof, and it's not even saving very much, so just undo the decision and treat the field like all others. Several places that convert a view RTE into a subquery RTE, or similar manipulations, failed to clear out fields that were specific to the original RTE type and should be zero in a subquery RTE. Since readfuncs.c will leave such fields as zero, equalfuncs.c thinks the nodes are different leading to a reported mismatch. It seems like a good idea to clear out the no-longer-needed fields, even though in principle nothing should look at them; the node ought to be indistinguishable from how it would look if we'd built a new node instead of scribbling on the old one. BuildOnConflictExcludedTargetlist randomly set the resname of some TargetEntries to "" not NULL. outfuncs/readfuncs don't distinguish those cases, and so the string will read back in as NULL ... but equalfuncs.c does distinguish. Perhaps we ought to try to make things more consistent in this area --- but it's just useless extra code space for BuildOnConflictExcludedTargetlist to not use NULL here, so I fixed it for now by making it do that. catversion bumped because the change in handling of Query.withCheckOptions affects stored rules. Discussion: https://postgr.es/m/17114.1537138992@sss.pgh.pa.us
2018-09-18 21:08:28 +02:00
WRITE_NODE_FIELD(withCheckOptions);
Add support for MERGE SQL command MERGE performs actions that modify rows in the target table using a source table or query. MERGE provides a single SQL statement that can conditionally INSERT/UPDATE/DELETE rows -- a task that would otherwise require multiple PL statements. For example, MERGE INTO target AS t USING source AS s ON t.tid = s.sid WHEN MATCHED AND t.balance > s.delta THEN UPDATE SET balance = t.balance - s.delta WHEN MATCHED THEN DELETE WHEN NOT MATCHED AND s.delta > 0 THEN INSERT VALUES (s.sid, s.delta) WHEN NOT MATCHED THEN DO NOTHING; MERGE works with regular tables, partitioned tables and inheritance hierarchies, including column and row security enforcement, as well as support for row and statement triggers and transition tables therein. MERGE is optimized for OLTP and is parameterizable, though also useful for large scale ETL/ELT. MERGE is not intended to be used in preference to existing single SQL commands for INSERT, UPDATE or DELETE since there is some overhead. MERGE can be used from PL/pgSQL. MERGE does not support targetting updatable views or foreign tables, and RETURNING clauses are not allowed either. These limitations are likely fixable with sufficient effort. Rewrite rules are also not supported, but it's not clear that we'd want to support them. Author: Pavan Deolasee <pavan.deolasee@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Author: Amit Langote <amitlangote09@gmail.com> Author: Simon Riggs <simon.riggs@enterprisedb.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier versions) Reviewed-by: Peter Geoghegan <pg@bowt.ie> (earlier versions) Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Japin Li <japinli@hotmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Zhihong Yu <zyu@yugabyte.com> Discussion: https://postgr.es/m/CANP8+jKitBSrB7oTgT9CY2i1ObfOt36z0XMraQc+Xrz8QB0nXA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkJdBuxj9PO=2QaO9-3h3xGbQPZ34kJH=HukRekwM-GZg@mail.gmail.com Discussion: https://postgr.es/m/20201231134736.GA25392@alvherre.pgsql
2022-03-28 16:45:58 +02:00
WRITE_NODE_FIELD(mergeActionList);
WRITE_BOOL_FIELD(mergeUseOuterJoin);
Change representation of statement lists, and add statement location info. This patch makes several changes that improve the consistency of representation of lists of statements. It's always been the case that the output of parse analysis is a list of Query nodes, whatever the types of the individual statements in the list. This patch brings similar consistency to the outputs of raw parsing and planning steps: * The output of raw parsing is now always a list of RawStmt nodes; the statement-type-dependent nodes are one level down from that. * The output of pg_plan_queries() is now always a list of PlannedStmt nodes, even for utility statements. In the case of a utility statement, "planning" just consists of wrapping a CMD_UTILITY PlannedStmt around the utility node. This list representation is now used in Portal and CachedPlan plan lists, replacing the former convention of intermixing PlannedStmts with bare utility-statement nodes. Now, every list of statements has a consistent head-node type depending on how far along it is in processing. This allows changing many places that formerly used generic "Node *" pointers to use a more specific pointer type, thus reducing the number of IsA() tests and casts needed, as well as improving code clarity. Also, the post-parse-analysis representation of DECLARE CURSOR is changed so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained SELECT remains a child of the DeclareCursorStmt rather than getting flipped around to be the other way. It's now true for both Query and PlannedStmt that utilityStmt is non-null if and only if commandType is CMD_UTILITY. That allows simplifying a lot of places that were testing both fields. (I think some of those were just defensive programming, but in many places, it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.) Because PlannedStmt carries a canSetTag field, we're also able to get rid of some ad-hoc rules about how to reconstruct canSetTag for a bare utility statement; specifically, the assumption that a utility is canSetTag if and only if it's the only one in its list. While I see no near-term need for relaxing that restriction, it's nice to get rid of the ad-hocery. The API of ProcessUtility() is changed so that what it's passed is the wrapper PlannedStmt not just the bare utility statement. This will affect all users of ProcessUtility_hook, but the changes are pretty trivial; see the affected contrib modules for examples of the minimum change needed. (Most compilers should give pointer-type-mismatch warnings for uncorrected code.) There's also a change in the API of ExplainOneQuery_hook, to pass through cursorOptions instead of expecting hook functions to know what to pick. This is needed because of the DECLARE CURSOR changes, but really should have been done in 9.6; it's unlikely that any extant hook functions know about using CURSOR_OPT_PARALLEL_OK. Finally, teach gram.y to save statement boundary locations in RawStmt nodes, and pass those through to Query and PlannedStmt nodes. This allows more intelligent handling of cases where a source query string contains multiple statements. This patch doesn't actually do anything with the information, but a follow-on patch will. (Passing this information through cleanly is the true motivation for these changes; while I think this is all good cleanup, it's unlikely we'd have bothered without this end goal.) catversion bump because addition of location fields to struct Query affects stored rules. This patch is by me, but it owes a good deal to Fabien Coelho who did a lot of preliminary work on the problem, and also reviewed the patch. Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
WRITE_LOCATION_FIELD(stmt_location);
WRITE_INT_FIELD(stmt_len);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outWithCheckOption(StringInfo str, const WithCheckOption *node)
{
WRITE_NODE_TYPE("WITHCHECKOPTION");
WRITE_ENUM_FIELD(kind, WCOKind);
WRITE_STRING_FIELD(relname);
WRITE_STRING_FIELD(polname);
WRITE_NODE_FIELD(qual);
WRITE_BOOL_FIELD(cascaded);
}
static void
_outSortGroupClause(StringInfo str, const SortGroupClause *node)
{
WRITE_NODE_TYPE("SORTGROUPCLAUSE");
WRITE_UINT_FIELD(tleSortGroupRef);
WRITE_OID_FIELD(eqop);
WRITE_OID_FIELD(sortop);
WRITE_BOOL_FIELD(nulls_first);
WRITE_BOOL_FIELD(hashable);
}
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
static void
_outGroupingSet(StringInfo str, const GroupingSet *node)
{
WRITE_NODE_TYPE("GROUPINGSET");
WRITE_ENUM_FIELD(kind, GroupingSetKind);
WRITE_NODE_FIELD(content);
WRITE_LOCATION_FIELD(location);
}
static void
_outWindowClause(StringInfo str, const WindowClause *node)
{
WRITE_NODE_TYPE("WINDOWCLAUSE");
WRITE_STRING_FIELD(name);
WRITE_STRING_FIELD(refname);
WRITE_NODE_FIELD(partitionClause);
WRITE_NODE_FIELD(orderClause);
WRITE_INT_FIELD(frameOptions);
WRITE_NODE_FIELD(startOffset);
WRITE_NODE_FIELD(endOffset);
Teach planner and executor about monotonic window funcs Window functions such as row_number() always return a value higher than the previously returned value for tuples in any given window partition. Traditionally queries such as; SELECT * FROM ( SELECT *, row_number() over (order by c) rn FROM t ) t WHERE rn <= 10; were executed fairly inefficiently. Neither the query planner nor the executor knew that once rn made it to 11 that nothing further would match the outer query's WHERE clause. It would blindly continue until all tuples were exhausted from the subquery. Here we implement means to make the above execute more efficiently. This is done by way of adding a pg_proc.prosupport function to various of the built-in window functions and adding supporting code to allow the support function to inform the planner if the window function is monotonically increasing, monotonically decreasing, both or neither. The planner is then able to make use of that information and possibly allow the executor to short-circuit execution by way of adding a "run condition" to the WindowAgg to allow it to determine if some of its execution work can be skipped. This "run condition" is not like a normal filter. These run conditions are only built using quals comparing values to monotonic window functions. For monotonic increasing functions, quals making use of the btree operators for <, <= and = can be used (assuming the window function column is on the left). You can see here that once such a condition becomes false that a monotonic increasing function could never make it subsequently true again. For monotonically decreasing functions the >, >= and = btree operators for the given type can be used for run conditions. The best-case situation for this is when there is a single WindowAgg node without a PARTITION BY clause. Here when the run condition becomes false the WindowAgg node can simply return NULL. No more tuples will ever match the run condition. It's a little more complex when there is a PARTITION BY clause. In this case, we cannot return NULL as we must still process other partitions. To speed this case up we pull tuples from the outer plan to check if they're from the same partition and simply discard them if they are. When we find a tuple belonging to another partition we start processing as normal again until the run condition becomes false or we run out of tuples to process. When there are multiple WindowAgg nodes to evaluate then this complicates the situation. For intermediate WindowAggs we must ensure we always return all tuples to the calling node. Any filtering done could lead to incorrect results in WindowAgg nodes above. For all intermediate nodes, we can still save some work when the run condition becomes false. We've no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot reference the value of these and these tuples will not appear in the final result anyway. The savings here are small in comparison to what can be saved in the top-level WingowAgg, but still worthwhile. Intermediate WindowAgg nodes never filter out tuples, but here we change WindowAgg so that the top-level WindowAgg filters out tuples that don't match the intermediate WindowAgg node's run condition. Such filters appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node. Here we add prosupport functions to allow the above to work for; row_number(), rank(), dense_rank(), count(*) and count(expr). It appears technically possible to do the same for min() and max(), however, it seems unlikely to be useful enough, so that's not done here. Bump catversion Author: David Rowley Reviewed-by: Andy Fan, Zhihong Yu Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
WRITE_NODE_FIELD(runCondition);
Support all SQL:2011 options for window frame clauses. This patch adds the ability to use "RANGE offset PRECEDING/FOLLOWING" frame boundaries in window functions. We'd punted on that back in the original patch to add window functions, because it was not clear how to do it in a reasonably data-type-extensible fashion. That problem is resolved here by adding the ability for btree operator classes to provide an "in_range" support function that defines how to add or subtract the RANGE offset value. Factoring it this way also allows the operator class to avoid overflow problems near the ends of the datatype's range, if it wishes to expend effort on that. (In the committed patch, the integer opclasses handle that issue, but it did not seem worth the trouble to avoid overflow failures for datetime types.) The patch includes in_range support for the integer_ops opfamily (int2/int4/int8) as well as the standard datetime types. Support for other numeric types has been requested, but that seems like suitable material for a follow-on patch. In addition, the patch adds GROUPS mode which counts the offset in ORDER-BY peer groups rather than rows, and it adds the frame_exclusion options specified by SQL:2011. As far as I can see, we are now fully up to spec on window framing options. Existing behaviors remain unchanged, except that I changed the errcode for a couple of existing error reports to meet the SQL spec's expectation that negative "offset" values should be reported as SQLSTATE 22013. Internally and in relevant parts of the documentation, we now consistently use the terminology "offset PRECEDING/FOLLOWING" rather than "value PRECEDING/FOLLOWING", since the term "value" is confusingly vague. Oliver Ford, reviewed and whacked around some by me Discussion: https://postgr.es/m/CAGMVOdu9sivPAxbNN0X+q19Sfv9edEPv=HibOJhB14TJv_RCQg@mail.gmail.com
2018-02-07 06:06:50 +01:00
WRITE_OID_FIELD(startInRangeFunc);
WRITE_OID_FIELD(endInRangeFunc);
WRITE_OID_FIELD(inRangeColl);
WRITE_BOOL_FIELD(inRangeAsc);
WRITE_BOOL_FIELD(inRangeNullsFirst);
WRITE_UINT_FIELD(winref);
WRITE_BOOL_FIELD(copiedOrder);
}
static void
_outRowMarkClause(StringInfo str, const RowMarkClause *node)
{
WRITE_NODE_TYPE("ROWMARKCLAUSE");
WRITE_UINT_FIELD(rti);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
WRITE_ENUM_FIELD(strength, LockClauseStrength);
WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
WRITE_BOOL_FIELD(pushedDown);
}
static void
_outWithClause(StringInfo str, const WithClause *node)
{
WRITE_NODE_TYPE("WITHCLAUSE");
WRITE_NODE_FIELD(ctes);
WRITE_BOOL_FIELD(recursive);
WRITE_LOCATION_FIELD(location);
}
static void
_outCTESearchClause(StringInfo str, const CTESearchClause *node)
{
WRITE_NODE_TYPE("CTESEARCHCLAUSE");
WRITE_NODE_FIELD(search_col_list);
WRITE_BOOL_FIELD(search_breadth_first);
WRITE_STRING_FIELD(search_seq_column);
WRITE_LOCATION_FIELD(location);
}
static void
_outCTECycleClause(StringInfo str, const CTECycleClause *node)
{
WRITE_NODE_TYPE("CTECYCLECLAUSE");
WRITE_NODE_FIELD(cycle_col_list);
WRITE_STRING_FIELD(cycle_mark_column);
WRITE_NODE_FIELD(cycle_mark_value);
WRITE_NODE_FIELD(cycle_mark_default);
WRITE_STRING_FIELD(cycle_path_column);
WRITE_LOCATION_FIELD(location);
WRITE_OID_FIELD(cycle_mark_type);
WRITE_INT_FIELD(cycle_mark_typmod);
WRITE_OID_FIELD(cycle_mark_collation);
WRITE_OID_FIELD(cycle_mark_neop);
}
static void
_outCommonTableExpr(StringInfo str, const CommonTableExpr *node)
{
WRITE_NODE_TYPE("COMMONTABLEEXPR");
WRITE_STRING_FIELD(ctename);
WRITE_NODE_FIELD(aliascolnames);
Allow user control of CTE materialization, and change the default behavior. Historically we've always materialized the full output of a CTE query, treating WITH as an optimization fence (so that, for example, restrictions from the outer query cannot be pushed into it). This is appropriate when the CTE query is INSERT/UPDATE/DELETE, or is recursive; but when the CTE query is non-recursive and side-effect-free, there's no hazard of changing the query results by pushing restrictions down. Another argument for materialization is that it can avoid duplicate computation of an expensive WITH query --- but that only applies if the WITH query is called more than once in the outer query. Even then it could still be a net loss, if each call has restrictions that would allow just a small part of the WITH query to be computed. Hence, let's change the behavior for WITH queries that are non-recursive and side-effect-free. By default, we will inline them into the outer query (removing the optimization fence) if they are called just once. If they are called more than once, we will keep the old behavior by default, but the user can override this and force inlining by specifying NOT MATERIALIZED. Lastly, the user can force the old behavior by specifying MATERIALIZED; this would mainly be useful when the query had deliberately been employing WITH as an optimization fence to prevent a poor choice of plan. Andreas Karlsson, Andrew Gierth, David Fetter Discussion: https://postgr.es/m/87sh48ffhb.fsf@news-spur.riddles.org.uk
2019-02-16 22:11:12 +01:00
WRITE_ENUM_FIELD(ctematerialized, CTEMaterialize);
WRITE_NODE_FIELD(ctequery);
WRITE_NODE_FIELD(search_clause);
WRITE_NODE_FIELD(cycle_clause);
WRITE_LOCATION_FIELD(location);
WRITE_BOOL_FIELD(cterecursive);
WRITE_INT_FIELD(cterefcount);
WRITE_NODE_FIELD(ctecolnames);
WRITE_NODE_FIELD(ctecoltypes);
WRITE_NODE_FIELD(ctecoltypmods);
WRITE_NODE_FIELD(ctecolcollations);
}
Add support for MERGE SQL command MERGE performs actions that modify rows in the target table using a source table or query. MERGE provides a single SQL statement that can conditionally INSERT/UPDATE/DELETE rows -- a task that would otherwise require multiple PL statements. For example, MERGE INTO target AS t USING source AS s ON t.tid = s.sid WHEN MATCHED AND t.balance > s.delta THEN UPDATE SET balance = t.balance - s.delta WHEN MATCHED THEN DELETE WHEN NOT MATCHED AND s.delta > 0 THEN INSERT VALUES (s.sid, s.delta) WHEN NOT MATCHED THEN DO NOTHING; MERGE works with regular tables, partitioned tables and inheritance hierarchies, including column and row security enforcement, as well as support for row and statement triggers and transition tables therein. MERGE is optimized for OLTP and is parameterizable, though also useful for large scale ETL/ELT. MERGE is not intended to be used in preference to existing single SQL commands for INSERT, UPDATE or DELETE since there is some overhead. MERGE can be used from PL/pgSQL. MERGE does not support targetting updatable views or foreign tables, and RETURNING clauses are not allowed either. These limitations are likely fixable with sufficient effort. Rewrite rules are also not supported, but it's not clear that we'd want to support them. Author: Pavan Deolasee <pavan.deolasee@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Author: Amit Langote <amitlangote09@gmail.com> Author: Simon Riggs <simon.riggs@enterprisedb.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier versions) Reviewed-by: Peter Geoghegan <pg@bowt.ie> (earlier versions) Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Japin Li <japinli@hotmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Zhihong Yu <zyu@yugabyte.com> Discussion: https://postgr.es/m/CANP8+jKitBSrB7oTgT9CY2i1ObfOt36z0XMraQc+Xrz8QB0nXA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkJdBuxj9PO=2QaO9-3h3xGbQPZ34kJH=HukRekwM-GZg@mail.gmail.com Discussion: https://postgr.es/m/20201231134736.GA25392@alvherre.pgsql
2022-03-28 16:45:58 +02:00
static void
_outMergeWhenClause(StringInfo str, const MergeWhenClause *node)
{
WRITE_NODE_TYPE("MERGEWHENCLAUSE");
WRITE_BOOL_FIELD(matched);
WRITE_ENUM_FIELD(commandType, CmdType);
WRITE_ENUM_FIELD(override, OverridingKind);
WRITE_NODE_FIELD(condition);
WRITE_NODE_FIELD(targetList);
WRITE_NODE_FIELD(values);
}
static void
_outMergeAction(StringInfo str, const MergeAction *node)
{
WRITE_NODE_TYPE("MERGEACTION");
WRITE_BOOL_FIELD(matched);
WRITE_ENUM_FIELD(commandType, CmdType);
WRITE_ENUM_FIELD(override, OverridingKind);
WRITE_NODE_FIELD(qual);
WRITE_NODE_FIELD(targetList);
WRITE_NODE_FIELD(updateColnos);
}
static void
_outSetOperationStmt(StringInfo str, const SetOperationStmt *node)
{
WRITE_NODE_TYPE("SETOPERATIONSTMT");
WRITE_ENUM_FIELD(op, SetOperation);
WRITE_BOOL_FIELD(all);
WRITE_NODE_FIELD(larg);
WRITE_NODE_FIELD(rarg);
WRITE_NODE_FIELD(colTypes);
WRITE_NODE_FIELD(colTypmods);
WRITE_NODE_FIELD(colCollations);
WRITE_NODE_FIELD(groupClauses);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
static void
_outRangeTblEntry(StringInfo str, const RangeTblEntry *node)
{
WRITE_NODE_TYPE("RANGETBLENTRY");
/* put alias + eref first to make dump more legible */
WRITE_NODE_FIELD(alias);
WRITE_NODE_FIELD(eref);
WRITE_ENUM_FIELD(rtekind, RTEKind);
switch (node->rtekind)
{
case RTE_RELATION:
WRITE_OID_FIELD(relid);
WRITE_CHAR_FIELD(relkind);
WRITE_INT_FIELD(rellockmode);
WRITE_NODE_FIELD(tablesample);
break;
case RTE_SUBQUERY:
WRITE_NODE_FIELD(subquery);
WRITE_BOOL_FIELD(security_barrier);
break;
case RTE_JOIN:
WRITE_ENUM_FIELD(jointype, JoinType);
Reconsider the representation of join alias Vars. The core idea of this patch is to make the parser generate join alias Vars (that is, ones with varno pointing to a JOIN RTE) only when the alias Var is actually different from any raw join input, that is a type coercion and/or COALESCE is necessary to generate the join output value. Otherwise just generate varno/varattno pointing to the relevant join input column. In effect, this means that the planner's flatten_join_alias_vars() transformation is already done in the parser, for all cases except (a) columns that are merged by JOIN USING and are transformed in the process, and (b) whole-row join Vars. In principle that would allow us to skip doing flatten_join_alias_vars() in many more queries than we do now, but we don't have quite enough infrastructure to know that we can do so --- in particular there's no cheap way to know whether there are any whole-row join Vars. I'm not sure if it's worth the trouble to add a Query-level flag for that, and in any case it seems like fit material for a separate patch. But even without skipping the work entirely, this should make flatten_join_alias_vars() faster, particularly where there are nested joins that it previously had to flatten recursively. An essential part of this change is to replace Var nodes' varnoold/varoattno fields with varnosyn/varattnosyn, which have considerably more tightly-defined meanings than the old fields: when they differ from varno/varattno, they identify the Var's position in an aliased JOIN RTE, and the join alias is what ruleutils.c should print for the Var. This is necessary because the varno change destroyed ruleutils.c's ability to find the JOIN RTE from the Var's varno. Another way in which this change broke ruleutils.c is that it's no longer feasible to determine, from a JOIN RTE's joinaliasvars list, which join columns correspond to which columns of the join's immediate input relations. (If those are sub-joins, the joinaliasvars entries may point to columns of their base relations, not the sub-joins.) But that was a horrid mess requiring a lot of fragile assumptions already, so let's just bite the bullet and add some more JOIN RTE fields to make it more straightforward to figure that out. I added two integer-List fields containing the relevant column numbers from the left and right input rels, plus a count of how many merged columns there are. This patch depends on the ParseNamespaceColumn infrastructure that I added in commit 5815696bc. The biggest bit of code change is restructuring transformFromClauseItem's handling of JOINs so that the ParseNamespaceColumn data is propagated upward correctly. Other than that and the ruleutils fixes, everything pretty much just works, though some processing is now inessential. I grabbed two pieces of low-hanging fruit in that line: 1. In find_expr_references, we don't need to recurse into join alias Vars anymore. There aren't any except for references to merged USING columns, which are more properly handled when we scan the join's RTE. This change actually fixes an edge-case issue: we will now record a dependency on any type-coercion function present in a USING column's joinaliasvar, even if that join column has no references in the query text. The odds of the missing dependency causing a problem seem quite small: you'd have to posit somebody dropping an implicit cast between two data types, without removing the types themselves, and then having a stored rule containing a whole-row Var for a join whose USING merge depends on that cast. So I don't feel a great need to change this in the back branches. But in theory this way is more correct. 2. markRTEForSelectPriv and markTargetListOrigin don't need to recurse into join alias Vars either, because the cases they care about don't apply to alias Vars for USING columns that are semantically distinct from the underlying columns. This removes the only case in which markVarForSelectPriv could be called with NULL for the RTE, so adjust the comments to describe that hack as being strictly internal to markRTEForSelectPriv. catversion bump required due to changes in stored rules. Discussion: https://postgr.es/m/7115.1577986646@sss.pgh.pa.us
2020-01-09 17:56:59 +01:00
WRITE_INT_FIELD(joinmergedcols);
WRITE_NODE_FIELD(joinaliasvars);
Reconsider the representation of join alias Vars. The core idea of this patch is to make the parser generate join alias Vars (that is, ones with varno pointing to a JOIN RTE) only when the alias Var is actually different from any raw join input, that is a type coercion and/or COALESCE is necessary to generate the join output value. Otherwise just generate varno/varattno pointing to the relevant join input column. In effect, this means that the planner's flatten_join_alias_vars() transformation is already done in the parser, for all cases except (a) columns that are merged by JOIN USING and are transformed in the process, and (b) whole-row join Vars. In principle that would allow us to skip doing flatten_join_alias_vars() in many more queries than we do now, but we don't have quite enough infrastructure to know that we can do so --- in particular there's no cheap way to know whether there are any whole-row join Vars. I'm not sure if it's worth the trouble to add a Query-level flag for that, and in any case it seems like fit material for a separate patch. But even without skipping the work entirely, this should make flatten_join_alias_vars() faster, particularly where there are nested joins that it previously had to flatten recursively. An essential part of this change is to replace Var nodes' varnoold/varoattno fields with varnosyn/varattnosyn, which have considerably more tightly-defined meanings than the old fields: when they differ from varno/varattno, they identify the Var's position in an aliased JOIN RTE, and the join alias is what ruleutils.c should print for the Var. This is necessary because the varno change destroyed ruleutils.c's ability to find the JOIN RTE from the Var's varno. Another way in which this change broke ruleutils.c is that it's no longer feasible to determine, from a JOIN RTE's joinaliasvars list, which join columns correspond to which columns of the join's immediate input relations. (If those are sub-joins, the joinaliasvars entries may point to columns of their base relations, not the sub-joins.) But that was a horrid mess requiring a lot of fragile assumptions already, so let's just bite the bullet and add some more JOIN RTE fields to make it more straightforward to figure that out. I added two integer-List fields containing the relevant column numbers from the left and right input rels, plus a count of how many merged columns there are. This patch depends on the ParseNamespaceColumn infrastructure that I added in commit 5815696bc. The biggest bit of code change is restructuring transformFromClauseItem's handling of JOINs so that the ParseNamespaceColumn data is propagated upward correctly. Other than that and the ruleutils fixes, everything pretty much just works, though some processing is now inessential. I grabbed two pieces of low-hanging fruit in that line: 1. In find_expr_references, we don't need to recurse into join alias Vars anymore. There aren't any except for references to merged USING columns, which are more properly handled when we scan the join's RTE. This change actually fixes an edge-case issue: we will now record a dependency on any type-coercion function present in a USING column's joinaliasvar, even if that join column has no references in the query text. The odds of the missing dependency causing a problem seem quite small: you'd have to posit somebody dropping an implicit cast between two data types, without removing the types themselves, and then having a stored rule containing a whole-row Var for a join whose USING merge depends on that cast. So I don't feel a great need to change this in the back branches. But in theory this way is more correct. 2. markRTEForSelectPriv and markTargetListOrigin don't need to recurse into join alias Vars either, because the cases they care about don't apply to alias Vars for USING columns that are semantically distinct from the underlying columns. This removes the only case in which markVarForSelectPriv could be called with NULL for the RTE, so adjust the comments to describe that hack as being strictly internal to markRTEForSelectPriv. catversion bump required due to changes in stored rules. Discussion: https://postgr.es/m/7115.1577986646@sss.pgh.pa.us
2020-01-09 17:56:59 +01:00
WRITE_NODE_FIELD(joinleftcols);
WRITE_NODE_FIELD(joinrightcols);
WRITE_NODE_FIELD(join_using_alias);
break;
case RTE_FUNCTION:
WRITE_NODE_FIELD(functions);
WRITE_BOOL_FIELD(funcordinality);
break;
case RTE_TABLEFUNC:
WRITE_NODE_FIELD(tablefunc);
break;
case RTE_VALUES:
WRITE_NODE_FIELD(values_lists);
Fix reporting of column typmods for multi-row VALUES constructs. expandRTE() and get_rte_attribute_type() reported the exprType() and exprTypmod() values of the expressions in the first row of the VALUES as being the column type/typmod returned by the VALUES RTE. That's fine for the data type, since we coerce all expressions in a column to have the same common type. But we don't coerce them to have a common typmod, so it was possible for rows after the first one to return values that violate the claimed column typmod. This leads to the incorrect result seen in bug #14448 from Hassan Mahmood, as well as some other corner-case misbehaviors. The desired behavior is the same as we use in other type-unification cases: report the common typmod if there is one, but otherwise return -1 indicating no particular constraint. It's cheap for transformValuesClause to determine the common typmod while transforming a multi-row VALUES, but it'd be less cheap for expandRTE() and get_rte_attribute_type() to re-determine that info every time they're asked --- possibly a lot less cheap, if the VALUES has many rows. Therefore, the best fix is to record the common typmods explicitly in a list in the VALUES RTE, as we were already doing for column collations. This looks quite a bit like what we're doing for CTE RTEs, so we can save a little bit of space and code by unifying the representation for those two RTE types. They both now share coltypes/coltypmods/colcollations fields. (At some point it might seem desirable to populate those fields for all RTE types; but right now it looks like constructing them for other RTE types would add more code and cycles than it would save.) The RTE change requires a catversion bump, so this fix is only usable in HEAD. If we fix this at all in the back branches, the patch will need to look quite different. Report: https://postgr.es/m/20161205143037.4377.60754@wrigleys.postgresql.org Discussion: https://postgr.es/m/27429.1480968538@sss.pgh.pa.us
2016-12-08 17:40:02 +01:00
WRITE_NODE_FIELD(coltypes);
WRITE_NODE_FIELD(coltypmods);
WRITE_NODE_FIELD(colcollations);
break;
case RTE_CTE:
WRITE_STRING_FIELD(ctename);
WRITE_UINT_FIELD(ctelevelsup);
WRITE_BOOL_FIELD(self_reference);
Fix reporting of column typmods for multi-row VALUES constructs. expandRTE() and get_rte_attribute_type() reported the exprType() and exprTypmod() values of the expressions in the first row of the VALUES as being the column type/typmod returned by the VALUES RTE. That's fine for the data type, since we coerce all expressions in a column to have the same common type. But we don't coerce them to have a common typmod, so it was possible for rows after the first one to return values that violate the claimed column typmod. This leads to the incorrect result seen in bug #14448 from Hassan Mahmood, as well as some other corner-case misbehaviors. The desired behavior is the same as we use in other type-unification cases: report the common typmod if there is one, but otherwise return -1 indicating no particular constraint. It's cheap for transformValuesClause to determine the common typmod while transforming a multi-row VALUES, but it'd be less cheap for expandRTE() and get_rte_attribute_type() to re-determine that info every time they're asked --- possibly a lot less cheap, if the VALUES has many rows. Therefore, the best fix is to record the common typmods explicitly in a list in the VALUES RTE, as we were already doing for column collations. This looks quite a bit like what we're doing for CTE RTEs, so we can save a little bit of space and code by unifying the representation for those two RTE types. They both now share coltypes/coltypmods/colcollations fields. (At some point it might seem desirable to populate those fields for all RTE types; but right now it looks like constructing them for other RTE types would add more code and cycles than it would save.) The RTE change requires a catversion bump, so this fix is only usable in HEAD. If we fix this at all in the back branches, the patch will need to look quite different. Report: https://postgr.es/m/20161205143037.4377.60754@wrigleys.postgresql.org Discussion: https://postgr.es/m/27429.1480968538@sss.pgh.pa.us
2016-12-08 17:40:02 +01:00
WRITE_NODE_FIELD(coltypes);
WRITE_NODE_FIELD(coltypmods);
WRITE_NODE_FIELD(colcollations);
break;
case RTE_NAMEDTUPLESTORE:
WRITE_STRING_FIELD(enrname);
WRITE_FLOAT_FIELD(enrtuples, "%.0f");
WRITE_OID_FIELD(relid);
WRITE_NODE_FIELD(coltypes);
WRITE_NODE_FIELD(coltypmods);
WRITE_NODE_FIELD(colcollations);
break;
In the planner, replace an empty FROM clause with a dummy RTE. The fact that "SELECT expression" has no base relations has long been a thorn in the side of the planner. It makes it hard to flatten a sub-query that looks like that, or is a trivial VALUES() item, because the planner generally uses relid sets to identify sub-relations, and such a sub-query would have an empty relid set if we flattened it. prepjointree.c contains some baroque logic that works around this in certain special cases --- but there is a much better answer. We can replace an empty FROM clause with a dummy RTE that acts like a table of one row and no columns, and then there are no such corner cases to worry about. Instead we need some logic to get rid of useless dummy RTEs, but that's simpler and covers more cases than what was there before. For really trivial cases, where the query is just "SELECT expression" and nothing else, there's a hazard that adding the extra RTE makes for a noticeable slowdown; even though it's not much processing, there's not that much for the planner to do overall. However testing says that the penalty is very small, close to the noise level. In more complex queries, this is able to find optimizations that we could not find before. The new RTE type is called RTE_RESULT, since the "scan" plan type it gives rise to is a Result node (the same plan we produced for a "SELECT expression" query before). To avoid confusion, rename the old ResultPath path type to GroupResultPath, reflecting that it's only used in degenerate grouping cases where we know the query produces just one grouped row. (It wouldn't work to unify the two cases, because there are different rules about where the associated quals live during query_planner.) Note: although this touches readfuncs.c, I don't think a catversion bump is required, because the added case can't occur in stored rules, only plans. Patch by me, reviewed by David Rowley and Mark Dilger Discussion: https://postgr.es/m/15944.1521127664@sss.pgh.pa.us
2019-01-28 23:54:10 +01:00
case RTE_RESULT:
/* no extra fields */
break;
default:
elog(ERROR, "unrecognized RTE kind: %d", (int) node->rtekind);
break;
}
WRITE_BOOL_FIELD(lateral);
WRITE_BOOL_FIELD(inh);
WRITE_BOOL_FIELD(inFromCl);
WRITE_UINT_FIELD(requiredPerms);
WRITE_OID_FIELD(checkAsUser);
WRITE_BITMAPSET_FIELD(selectedCols);
WRITE_BITMAPSET_FIELD(insertedCols);
WRITE_BITMAPSET_FIELD(updatedCols);
WRITE_BITMAPSET_FIELD(extraUpdatedCols);
WRITE_NODE_FIELD(securityQuals);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outRangeTblFunction(StringInfo str, const RangeTblFunction *node)
{
WRITE_NODE_TYPE("RANGETBLFUNCTION");
WRITE_NODE_FIELD(funcexpr);
WRITE_INT_FIELD(funccolcount);
WRITE_NODE_FIELD(funccolnames);
WRITE_NODE_FIELD(funccoltypes);
WRITE_NODE_FIELD(funccoltypmods);
WRITE_NODE_FIELD(funccolcollations);
WRITE_BITMAPSET_FIELD(funcparams);
}
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
static void
_outTableSampleClause(StringInfo str, const TableSampleClause *node)
{
WRITE_NODE_TYPE("TABLESAMPLECLAUSE");
WRITE_OID_FIELD(tsmhandler);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(repeatable);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
static void
_outA_Expr(StringInfo str, const A_Expr *node)
{
WRITE_NODE_TYPE("A_EXPR");
switch (node->kind)
{
case AEXPR_OP:
appendStringInfoChar(str, ' ');
WRITE_NODE_FIELD(name);
break;
case AEXPR_OP_ANY:
appendStringInfoChar(str, ' ');
WRITE_NODE_FIELD(name);
appendStringInfoString(str, " ANY ");
break;
case AEXPR_OP_ALL:
appendStringInfoChar(str, ' ');
WRITE_NODE_FIELD(name);
appendStringInfoString(str, " ALL ");
break;
case AEXPR_DISTINCT:
appendStringInfoString(str, " DISTINCT ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_NOT_DISTINCT:
appendStringInfoString(str, " NOT_DISTINCT ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_NULLIF:
appendStringInfoString(str, " NULLIF ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_IN:
appendStringInfoString(str, " IN ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_LIKE:
appendStringInfoString(str, " LIKE ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_ILIKE:
appendStringInfoString(str, " ILIKE ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_SIMILAR:
appendStringInfoString(str, " SIMILAR ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_BETWEEN:
appendStringInfoString(str, " BETWEEN ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_NOT_BETWEEN:
appendStringInfoString(str, " NOT_BETWEEN ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_BETWEEN_SYM:
appendStringInfoString(str, " BETWEEN_SYM ");
WRITE_NODE_FIELD(name);
break;
case AEXPR_NOT_BETWEEN_SYM:
appendStringInfoString(str, " NOT_BETWEEN_SYM ");
WRITE_NODE_FIELD(name);
break;
2000-05-26 00:43:12 +02:00
default:
appendStringInfoString(str, " ??");
2000-05-26 00:43:12 +02:00
break;
}
WRITE_NODE_FIELD(lexpr);
WRITE_NODE_FIELD(rexpr);
WRITE_LOCATION_FIELD(location);
}
static void
_outInteger(StringInfo str, const Integer *node)
{
appendStringInfo(str, "%d", node->ival);
}
2004-08-29 07:07:03 +02:00
static void
_outFloat(StringInfo str, const Float *node)
{
/*
* We assume the value is a valid numeric literal and so does not need
* quoting.
*/
appendStringInfoString(str, node->fval);
}
static void
_outBoolean(StringInfo str, const Boolean *node)
{
appendStringInfoString(str, node->boolval ? "true" : "false");
}
static void
_outString(StringInfo str, const String *node)
{
/*
* We use outToken to provide escaping of the string's content, but we
* don't want it to do anything with an empty string.
*/
appendStringInfoChar(str, '"');
if (node->sval[0] != '\0')
outToken(str, node->sval);
appendStringInfoChar(str, '"');
}
static void
_outBitString(StringInfo str, const BitString *node)
{
/* internal representation already has leading 'b' */
appendStringInfoString(str, node->bsval);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outColumnRef(StringInfo str, const ColumnRef *node)
{
WRITE_NODE_TYPE("COLUMNREF");
WRITE_NODE_FIELD(fields);
WRITE_LOCATION_FIELD(location);
}
static void
_outParamRef(StringInfo str, const ParamRef *node)
{
WRITE_NODE_TYPE("PARAMREF");
WRITE_INT_FIELD(number);
WRITE_LOCATION_FIELD(location);
}
/*
* Node types found in raw parse trees (supported for debug purposes)
*/
static void
_outRawStmt(StringInfo str, const RawStmt *node)
{
WRITE_NODE_TYPE("RAWSTMT");
WRITE_NODE_FIELD(stmt);
WRITE_LOCATION_FIELD(stmt_location);
WRITE_INT_FIELD(stmt_len);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
static void
_outA_Const(StringInfo str, const A_Const *node)
{
WRITE_NODE_TYPE("A_CONST");
if (node->isnull)
appendStringInfoString(str, "NULL");
else
{
appendStringInfoString(str, " :val ");
outNode(str, &node->val);
}
WRITE_LOCATION_FIELD(location);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outA_Star(StringInfo str, const A_Star *node)
{
WRITE_NODE_TYPE("A_STAR");
}
static void
_outA_Indices(StringInfo str, const A_Indices *node)
{
WRITE_NODE_TYPE("A_INDICES");
WRITE_BOOL_FIELD(is_slice);
WRITE_NODE_FIELD(lidx);
WRITE_NODE_FIELD(uidx);
}
static void
_outA_Indirection(StringInfo str, const A_Indirection *node)
{
WRITE_NODE_TYPE("A_INDIRECTION");
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(indirection);
}
static void
_outA_ArrayExpr(StringInfo str, const A_ArrayExpr *node)
{
WRITE_NODE_TYPE("A_ARRAYEXPR");
WRITE_NODE_FIELD(elements);
WRITE_LOCATION_FIELD(location);
}
static void
_outResTarget(StringInfo str, const ResTarget *node)
{
WRITE_NODE_TYPE("RESTARGET");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(indirection);
WRITE_NODE_FIELD(val);
WRITE_LOCATION_FIELD(location);
}
static void
_outMultiAssignRef(StringInfo str, const MultiAssignRef *node)
{
WRITE_NODE_TYPE("MULTIASSIGNREF");
WRITE_NODE_FIELD(source);
WRITE_INT_FIELD(colno);
WRITE_INT_FIELD(ncolumns);
}
static void
_outSortBy(StringInfo str, const SortBy *node)
{
WRITE_NODE_TYPE("SORTBY");
WRITE_NODE_FIELD(node);
WRITE_ENUM_FIELD(sortby_dir, SortByDir);
WRITE_ENUM_FIELD(sortby_nulls, SortByNulls);
WRITE_NODE_FIELD(useOp);
WRITE_LOCATION_FIELD(location);
}
static void
_outWindowDef(StringInfo str, const WindowDef *node)
{
WRITE_NODE_TYPE("WINDOWDEF");
WRITE_STRING_FIELD(name);
WRITE_STRING_FIELD(refname);
WRITE_NODE_FIELD(partitionClause);
WRITE_NODE_FIELD(orderClause);
WRITE_INT_FIELD(frameOptions);
WRITE_NODE_FIELD(startOffset);
WRITE_NODE_FIELD(endOffset);
WRITE_LOCATION_FIELD(location);
}
static void
_outRangeSubselect(StringInfo str, const RangeSubselect *node)
{
WRITE_NODE_TYPE("RANGESUBSELECT");
WRITE_BOOL_FIELD(lateral);
WRITE_NODE_FIELD(subquery);
WRITE_NODE_FIELD(alias);
}
static void
_outRangeFunction(StringInfo str, const RangeFunction *node)
{
WRITE_NODE_TYPE("RANGEFUNCTION");
WRITE_BOOL_FIELD(lateral);
WRITE_BOOL_FIELD(ordinality);
WRITE_BOOL_FIELD(is_rowsfrom);
WRITE_NODE_FIELD(functions);
WRITE_NODE_FIELD(alias);
WRITE_NODE_FIELD(coldeflist);
}
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
static void
_outRangeTableSample(StringInfo str, const RangeTableSample *node)
{
WRITE_NODE_TYPE("RANGETABLESAMPLE");
WRITE_NODE_FIELD(relation);
WRITE_NODE_FIELD(method);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(repeatable);
WRITE_LOCATION_FIELD(location);
}
static void
_outRangeTableFunc(StringInfo str, const RangeTableFunc *node)
{
WRITE_NODE_TYPE("RANGETABLEFUNC");
WRITE_BOOL_FIELD(lateral);
WRITE_NODE_FIELD(docexpr);
WRITE_NODE_FIELD(rowexpr);
WRITE_NODE_FIELD(namespaces);
WRITE_NODE_FIELD(columns);
WRITE_NODE_FIELD(alias);
WRITE_LOCATION_FIELD(location);
}
static void
_outRangeTableFuncCol(StringInfo str, const RangeTableFuncCol *node)
{
WRITE_NODE_TYPE("RANGETABLEFUNCCOL");
WRITE_STRING_FIELD(colname);
WRITE_NODE_FIELD(typeName);
WRITE_BOOL_FIELD(for_ordinality);
WRITE_BOOL_FIELD(is_not_null);
WRITE_NODE_FIELD(colexpr);
WRITE_NODE_FIELD(coldefexpr);
WRITE_LOCATION_FIELD(location);
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
1998-12-04 16:34:49 +01:00
static void
_outConstraint(StringInfo str, const Constraint *node)
1998-12-04 16:34:49 +01:00
{
WRITE_NODE_TYPE("CONSTRAINT");
WRITE_STRING_FIELD(conname);
WRITE_BOOL_FIELD(deferrable);
WRITE_BOOL_FIELD(initdeferred);
WRITE_LOCATION_FIELD(location);
1998-12-04 16:34:49 +01:00
appendStringInfoString(str, " :contype ");
1998-12-04 16:34:49 +01:00
switch (node->contype)
{
case CONSTR_NULL:
appendStringInfoString(str, "NULL");
break;
case CONSTR_NOTNULL:
appendStringInfoString(str, "NOT_NULL");
break;
case CONSTR_DEFAULT:
appendStringInfoString(str, "DEFAULT");
WRITE_NODE_FIELD(raw_expr);
WRITE_STRING_FIELD(cooked_expr);
break;
case CONSTR_IDENTITY:
appendStringInfoString(str, "IDENTITY");
WRITE_NODE_FIELD(raw_expr);
WRITE_STRING_FIELD(cooked_expr);
WRITE_CHAR_FIELD(generated_when);
break;
case CONSTR_GENERATED:
appendStringInfoString(str, "GENERATED");
WRITE_NODE_FIELD(raw_expr);
WRITE_STRING_FIELD(cooked_expr);
WRITE_CHAR_FIELD(generated_when);
break;
case CONSTR_CHECK:
appendStringInfoString(str, "CHECK");
WRITE_BOOL_FIELD(is_no_inherit);
WRITE_NODE_FIELD(raw_expr);
WRITE_STRING_FIELD(cooked_expr);
break;
1998-12-04 16:34:49 +01:00
case CONSTR_PRIMARY:
appendStringInfoString(str, "PRIMARY_KEY");
WRITE_NODE_FIELD(keys);
WRITE_NODE_FIELD(including);
WRITE_NODE_FIELD(options);
WRITE_STRING_FIELD(indexname);
WRITE_STRING_FIELD(indexspace);
WRITE_BOOL_FIELD(reset_default_tblspc);
/* access_method and where_clause not currently used */
break;
case CONSTR_UNIQUE:
appendStringInfoString(str, "UNIQUE");
WRITE_BOOL_FIELD(nulls_not_distinct);
WRITE_NODE_FIELD(keys);
WRITE_NODE_FIELD(including);
WRITE_NODE_FIELD(options);
WRITE_STRING_FIELD(indexname);
WRITE_STRING_FIELD(indexspace);
WRITE_BOOL_FIELD(reset_default_tblspc);
/* access_method and where_clause not currently used */
break;
case CONSTR_EXCLUSION:
appendStringInfoString(str, "EXCLUSION");
WRITE_NODE_FIELD(exclusions);
WRITE_NODE_FIELD(including);
WRITE_NODE_FIELD(options);
WRITE_STRING_FIELD(indexname);
WRITE_STRING_FIELD(indexspace);
WRITE_BOOL_FIELD(reset_default_tblspc);
WRITE_STRING_FIELD(access_method);
WRITE_NODE_FIELD(where_clause);
1998-12-04 16:34:49 +01:00
break;
case CONSTR_FOREIGN:
appendStringInfoString(str, "FOREIGN_KEY");
WRITE_NODE_FIELD(pktable);
WRITE_NODE_FIELD(fk_attrs);
WRITE_NODE_FIELD(pk_attrs);
WRITE_CHAR_FIELD(fk_matchtype);
WRITE_CHAR_FIELD(fk_upd_action);
WRITE_CHAR_FIELD(fk_del_action);
WRITE_NODE_FIELD(fk_del_set_cols);
WRITE_NODE_FIELD(old_conpfeqop);
WRITE_OID_FIELD(old_pktable_oid);
WRITE_BOOL_FIELD(skip_validation);
WRITE_BOOL_FIELD(initially_valid);
1998-12-04 16:34:49 +01:00
break;
case CONSTR_ATTR_DEFERRABLE:
appendStringInfoString(str, "ATTR_DEFERRABLE");
1998-12-04 16:34:49 +01:00
break;
case CONSTR_ATTR_NOT_DEFERRABLE:
appendStringInfoString(str, "ATTR_NOT_DEFERRABLE");
1998-12-04 16:34:49 +01:00
break;
case CONSTR_ATTR_DEFERRED:
appendStringInfoString(str, "ATTR_DEFERRED");
1998-12-04 16:34:49 +01:00
break;
case CONSTR_ATTR_IMMEDIATE:
appendStringInfoString(str, "ATTR_IMMEDIATE");
break;
default:
appendStringInfo(str, "<unrecognized_constraint %d>",
(int) node->contype);
break;
}
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#ifdef OBSOLETE
static void
_outForeignKeyCacheInfo(StringInfo str, const ForeignKeyCacheInfo *node)
{
WRITE_NODE_TYPE("FOREIGNKEYCACHEINFO");
Correct attach/detach logic for FKs in partitions There was no code to handle foreign key constraints on partitioned tables in the case of ALTER TABLE DETACH; and if you happened to ATTACH a partition that already had an equivalent constraint, that one was ignored and a new constraint was created. Adding this to the fact that foreign key cloning reuses the constraint name on the partition instead of generating a new name (as it probably should, to cater to SQL standard rules about constraint naming within schemas), the result was a pretty poor user experience -- the most visible failure was that just detaching a partition and re-attaching it failed with an error such as ERROR: duplicate key value violates unique constraint "pg_constraint_conrelid_contypid_conname_index" DETAIL: Key (conrelid, contypid, conname)=(26702, 0, test_result_asset_id_fkey) already exists. because it would try to create an identically-named constraint in the partition. To make matters worse, if you tried to drop the constraint in the now-independent partition, that would fail because the constraint was still seen as dependent on the constraint in its former parent partitioned table: ERROR: cannot drop inherited constraint "test_result_asset_id_fkey" of relation "test_result_cbsystem_0001_0050_monthly_2018_09" This fix attacks the problem from two angles: first, when the partition is detached, the constraint is also marked as independent, so the drop now works. Second, when the partition is re-attached, we scan existing constraints searching for one matching the FK in the parent, and if one exists, we link that one to the parent constraint. So we don't end up with a duplicate -- and better yet, we don't need to scan the referenced table to verify that the constraint holds. To implement this I made a small change to previously planner-only struct ForeignKeyCacheInfo to contain the constraint OID; also relcache now maintains the list of FKs for partitioned tables too. Backpatch to 11. Reported-by: Michael Vitale (bug #15425) Discussion: https://postgr.es/m/15425-2dbc9d2aa999f816@postgresql.org
2018-10-12 17:36:26 +02:00
WRITE_OID_FIELD(conoid);
WRITE_OID_FIELD(conrelid);
WRITE_OID_FIELD(confrelid);
WRITE_INT_FIELD(nkeys);
WRITE_ATTRNUMBER_ARRAY(conkey, node->nkeys);
WRITE_ATTRNUMBER_ARRAY(confkey, node->nkeys);
WRITE_OID_ARRAY(conpfeqop, node->nkeys);
}
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
static void
_outPartitionElem(StringInfo str, const PartitionElem *node)
{
WRITE_NODE_TYPE("PARTITIONELEM");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(expr);
WRITE_NODE_FIELD(collation);
WRITE_NODE_FIELD(opclass);
WRITE_LOCATION_FIELD(location);
}
Code review focused on new node types added by partitioning support. Fix failure to check that we got a plain Const from const-simplification of a coercion request. This is the cause of bug #14666 from Tian Bing: there is an int4 to money cast, but it's only stable not immutable (because of dependence on lc_monetary), resulting in a FuncExpr that the code was miserably unequipped to deal with, or indeed even to notice that it was failing to deal with. Add test cases around this coercion behavior. In view of the above, sprinkle the code liberally with castNode() macros, in hope of catching the next such bug a bit sooner. Also, change some functions that were randomly declared to take Node* to take more specific pointer types. And change some struct fields that were declared Node* but could be given more specific types, allowing removal of assorted explicit casts. Place PARTITION_MAX_KEYS check a bit closer to the code it's protecting. Likewise check only-one-key-for-list-partitioning restriction in a less random place. Avoid not-per-project-style usages like !strcmp(...). Fix assorted failures to avoid scribbling on the input of parse transformation. I'm not sure how necessary this is, but it's entirely silly for these functions to be expending cycles to avoid that and not getting it right. Add guards against partitioning on system columns. Put backend/nodes/ support code into an order that matches handling of these node types elsewhere. Annotate the fact that somebody added location fields to PartitionBoundSpec and PartitionRangeDatum but forgot to handle them in outfuncs.c/readfuncs.c. This is fairly harmless for production purposes (since readfuncs.c would just substitute -1 anyway) but it's still bogus. It's not worth forcing a post-beta1 initdb just to fix this, but if we have another reason to force initdb before 10.0, we should go back and clean this up. Contrariwise, somebody added location fields to PartitionElem and PartitionSpec but forgot to teach exprLocation() about them. Consolidate duplicative code in transformPartitionBound(). Improve a couple of error messages. Improve assorted commentary. Re-pgindent the files touched by this patch; this affects a few comment blocks that must have been added quite recently. Report: https://postgr.es/m/20170524024550.29935.14396@wrigleys.postgresql.org
2017-05-29 05:20:28 +02:00
static void
_outPartitionSpec(StringInfo str, const PartitionSpec *node)
{
WRITE_NODE_TYPE("PARTITIONSPEC");
Code review focused on new node types added by partitioning support. Fix failure to check that we got a plain Const from const-simplification of a coercion request. This is the cause of bug #14666 from Tian Bing: there is an int4 to money cast, but it's only stable not immutable (because of dependence on lc_monetary), resulting in a FuncExpr that the code was miserably unequipped to deal with, or indeed even to notice that it was failing to deal with. Add test cases around this coercion behavior. In view of the above, sprinkle the code liberally with castNode() macros, in hope of catching the next such bug a bit sooner. Also, change some functions that were randomly declared to take Node* to take more specific pointer types. And change some struct fields that were declared Node* but could be given more specific types, allowing removal of assorted explicit casts. Place PARTITION_MAX_KEYS check a bit closer to the code it's protecting. Likewise check only-one-key-for-list-partitioning restriction in a less random place. Avoid not-per-project-style usages like !strcmp(...). Fix assorted failures to avoid scribbling on the input of parse transformation. I'm not sure how necessary this is, but it's entirely silly for these functions to be expending cycles to avoid that and not getting it right. Add guards against partitioning on system columns. Put backend/nodes/ support code into an order that matches handling of these node types elsewhere. Annotate the fact that somebody added location fields to PartitionBoundSpec and PartitionRangeDatum but forgot to handle them in outfuncs.c/readfuncs.c. This is fairly harmless for production purposes (since readfuncs.c would just substitute -1 anyway) but it's still bogus. It's not worth forcing a post-beta1 initdb just to fix this, but if we have another reason to force initdb before 10.0, we should go back and clean this up. Contrariwise, somebody added location fields to PartitionElem and PartitionSpec but forgot to teach exprLocation() about them. Consolidate duplicative code in transformPartitionBound(). Improve a couple of error messages. Improve assorted commentary. Re-pgindent the files touched by this patch; this affects a few comment blocks that must have been added quite recently. Report: https://postgr.es/m/20170524024550.29935.14396@wrigleys.postgresql.org
2017-05-29 05:20:28 +02:00
WRITE_STRING_FIELD(strategy);
WRITE_NODE_FIELD(partParams);
WRITE_LOCATION_FIELD(location);
}
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
static void
_outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node)
{
WRITE_NODE_TYPE("PARTITIONBOUNDSPEC");
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
WRITE_CHAR_FIELD(strategy);
WRITE_BOOL_FIELD(is_default);
WRITE_INT_FIELD(modulus);
WRITE_INT_FIELD(remainder);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
WRITE_NODE_FIELD(listdatums);
WRITE_NODE_FIELD(lowerdatums);
WRITE_NODE_FIELD(upperdatums);
WRITE_LOCATION_FIELD(location);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
}
static void
_outPartitionRangeDatum(StringInfo str, const PartitionRangeDatum *node)
{
WRITE_NODE_TYPE("PARTITIONRANGEDATUM");
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
WRITE_ENUM_FIELD(kind, PartitionRangeDatumKind);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
WRITE_NODE_FIELD(value);
WRITE_LOCATION_FIELD(location);
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
}
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
/*
* outNode -
* converts a Node into ascii string and append it to 'str'
*/
void
outNode(StringInfo str, const void *obj)
{
/* Guard against stack overflow due to overly complex expressions */
check_stack_depth();
if (obj == NULL)
appendStringInfoString(str, "<>");
else if (IsA(obj, List) || IsA(obj, IntList) || IsA(obj, OidList))
_outList(str, obj);
/* nodeRead does not want to see { } around these! */
else if (IsA(obj, Integer))
_outInteger(str, (Integer *) obj);
else if (IsA(obj, Float))
_outFloat(str, (Float *) obj);
else if (IsA(obj, Boolean))
_outBoolean(str, (Boolean *) obj);
else if (IsA(obj, String))
_outString(str, (String *) obj);
else if (IsA(obj, BitString))
_outBitString(str, (BitString *) obj);
else
{
appendStringInfoChar(str, '{');
switch (nodeTag(obj))
{
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#include "outfuncs.switch.c"
#ifdef OBSOLETE
case T_PlannedStmt:
_outPlannedStmt(str, obj);
break;
case T_Result:
_outResult(str, obj);
break;
Move targetlist SRF handling from expression evaluation to new executor node. Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT generate_series(1,5)) so far was done in the expression evaluation (i.e. ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code. This meant that most executor nodes performing projection, and most expression evaluation functions, had to deal with the possibility that an evaluated expression could return a set of return values. That's bad because it leads to repeated code in a lot of places. It also, and that's my (Andres's) motivation, made it a lot harder to implement a more efficient way of doing expression evaluation. To fix this, introduce a new executor node (ProjectSet) that can evaluate targetlists containing one or more SRFs. To avoid the complexity of the old way of handling nested expressions returning sets (e.g. having to pass up ExprDoneCond, and dealing with arguments to functions returning sets etc.), those SRFs can only be at the top level of the node's targetlist. The planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is only necessary in ProjectSet nodes and that SRFs are only present at the top level of the node's targetlist. If there are nested SRFs the planner creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get input from an underlying node. We also discussed and prototyped evaluating targetlist SRFs using ROWS FROM(), but that turned out to be more complicated than we'd hoped. While moving SRF evaluation to ProjectSet would allow to retain the old "least common multiple" behavior when multiple SRFs are present in one targetlist (i.e. continue returning rows until all SRFs are at the end of their input at the same time), we decided to instead only return rows till all SRFs are exhausted, returning NULL for already exhausted ones. We deemed the previous behavior to be too confusing, unexpected and actually not particularly useful. As a side effect, the previously prohibited case of multiple set returning arguments to a function, is now allowed. Not because it's particularly desirable, but because it ends up working and there seems to be no argument for adding code to prohibit it. Currently the behavior for COALESCE and CASE containing SRFs has changed, returning multiple rows from the expression, even when the SRF containing "arm" of the expression is not evaluated. That's because the SRFs are evaluated in a separate ProjectSet node. As that's quite confusing, we're likely to instead prohibit SRFs in those places. But that's still being discussed, and the code would reside in places not touched here, so that's a task for later. There's a lot of, now superfluous, code dealing with set return expressions around. But as the changes to get rid of those are verbose largely boring, it seems better for readability to keep the cleanup as a separate commit. Author: Tom Lane and Andres Freund Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
case T_ProjectSet:
_outProjectSet(str, obj);
break;
case T_ModifyTable:
_outModifyTable(str, obj);
break;
case T_Append:
_outAppend(str, obj);
break;
case T_MergeAppend:
_outMergeAppend(str, obj);
break;
case T_RecursiveUnion:
_outRecursiveUnion(str, obj);
break;
case T_BitmapAnd:
_outBitmapAnd(str, obj);
break;
case T_BitmapOr:
_outBitmapOr(str, obj);
break;
case T_Gather:
_outGather(str, obj);
break;
case T_GatherMerge:
_outGatherMerge(str, obj);
break;
case T_Scan:
_outScan(str, obj);
break;
case T_SeqScan:
_outSeqScan(str, obj);
break;
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
case T_SampleScan:
_outSampleScan(str, obj);
break;
case T_IndexScan:
_outIndexScan(str, obj);
break;
case T_IndexOnlyScan:
_outIndexOnlyScan(str, obj);
break;
case T_BitmapIndexScan:
_outBitmapIndexScan(str, obj);
break;
case T_BitmapHeapScan:
_outBitmapHeapScan(str, obj);
break;
case T_TidScan:
_outTidScan(str, obj);
break;
case T_TidRangeScan:
_outTidRangeScan(str, obj);
break;
case T_SubqueryScan:
_outSubqueryScan(str, obj);
break;
case T_FunctionScan:
_outFunctionScan(str, obj);
break;
case T_TableFuncScan:
_outTableFuncScan(str, obj);
break;
case T_ValuesScan:
_outValuesScan(str, obj);
break;
case T_CteScan:
_outCteScan(str, obj);
break;
case T_NamedTuplestoreScan:
_outNamedTuplestoreScan(str, obj);
break;
case T_WorkTableScan:
_outWorkTableScan(str, obj);
break;
case T_ForeignScan:
_outForeignScan(str, obj);
break;
case T_CustomScan:
_outCustomScan(str, obj);
break;
case T_NestLoop:
_outNestLoop(str, obj);
break;
case T_MergeJoin:
_outMergeJoin(str, obj);
break;
case T_HashJoin:
_outHashJoin(str, obj);
break;
case T_Agg:
_outAgg(str, obj);
break;
case T_WindowAgg:
_outWindowAgg(str, obj);
break;
case T_Group:
_outGroup(str, obj);
break;
case T_Material:
_outMaterial(str, obj);
break;
case T_Memoize:
_outMemoize(str, obj);
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
break;
case T_Sort:
_outSort(str, obj);
break;
Implement Incremental Sort Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00
case T_IncrementalSort:
_outIncrementalSort(str, obj);
break;
case T_Unique:
_outUnique(str, obj);
break;
case T_Hash:
_outHash(str, obj);
break;
case T_SetOp:
_outSetOp(str, obj);
break;
case T_LockRows:
_outLockRows(str, obj);
break;
case T_Limit:
_outLimit(str, obj);
break;
case T_NestLoopParam:
_outNestLoopParam(str, obj);
break;
case T_PlanRowMark:
_outPlanRowMark(str, obj);
break;
case T_PartitionPruneInfo:
_outPartitionPruneInfo(str, obj);
break;
case T_PartitionedRelPruneInfo:
_outPartitionedRelPruneInfo(str, obj);
break;
case T_PartitionPruneStepOp:
_outPartitionPruneStepOp(str, obj);
break;
case T_PartitionPruneStepCombine:
_outPartitionPruneStepCombine(str, obj);
break;
case T_PlanInvalItem:
_outPlanInvalItem(str, obj);
break;
case T_Alias:
_outAlias(str, obj);
break;
case T_RangeVar:
_outRangeVar(str, obj);
break;
case T_TableFunc:
_outTableFunc(str, obj);
break;
case T_IntoClause:
_outIntoClause(str, obj);
break;
case T_Var:
_outVar(str, obj);
break;
case T_Const:
_outConst(str, obj);
break;
case T_Param:
_outParam(str, obj);
break;
1999-01-24 01:28:37 +01:00
case T_Aggref:
_outAggref(str, obj);
break;
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
case T_GroupingFunc:
_outGroupingFunc(str, obj);
break;
case T_WindowFunc:
_outWindowFunc(str, obj);
break;
case T_SubscriptingRef:
_outSubscriptingRef(str, obj);
break;
case T_FuncExpr:
_outFuncExpr(str, obj);
break;
case T_NamedArgExpr:
_outNamedArgExpr(str, obj);
break;
case T_OpExpr:
_outOpExpr(str, obj);
break;
case T_DistinctExpr:
_outDistinctExpr(str, obj);
break;
case T_NullIfExpr:
_outNullIfExpr(str, obj);
break;
case T_ScalarArrayOpExpr:
_outScalarArrayOpExpr(str, obj);
break;
case T_BoolExpr:
_outBoolExpr(str, obj);
break;
case T_SubLink:
_outSubLink(str, obj);
break;
case T_SubPlan:
_outSubPlan(str, obj);
break;
case T_AlternativeSubPlan:
_outAlternativeSubPlan(str, obj);
break;
case T_FieldSelect:
_outFieldSelect(str, obj);
break;
case T_FieldStore:
_outFieldStore(str, obj);
break;
case T_RelabelType:
_outRelabelType(str, obj);
break;
case T_CoerceViaIO:
_outCoerceViaIO(str, obj);
break;
case T_ArrayCoerceExpr:
_outArrayCoerceExpr(str, obj);
break;
case T_ConvertRowtypeExpr:
_outConvertRowtypeExpr(str, obj);
break;
case T_CollateExpr:
_outCollateExpr(str, obj);
break;
case T_CaseExpr:
_outCaseExpr(str, obj);
break;
case T_CaseWhen:
_outCaseWhen(str, obj);
break;
case T_CaseTestExpr:
_outCaseTestExpr(str, obj);
break;
case T_ArrayExpr:
_outArrayExpr(str, obj);
break;
case T_RowExpr:
_outRowExpr(str, obj);
break;
case T_RowCompareExpr:
_outRowCompareExpr(str, obj);
break;
case T_CoalesceExpr:
_outCoalesceExpr(str, obj);
break;
case T_MinMaxExpr:
_outMinMaxExpr(str, obj);
break;
case T_SQLValueFunction:
_outSQLValueFunction(str, obj);
break;
case T_XmlExpr:
_outXmlExpr(str, obj);
break;
case T_NullTest:
_outNullTest(str, obj);
break;
case T_BooleanTest:
_outBooleanTest(str, obj);
break;
case T_CoerceToDomain:
_outCoerceToDomain(str, obj);
break;
case T_CoerceToDomainValue:
_outCoerceToDomainValue(str, obj);
break;
case T_SetToDefault:
_outSetToDefault(str, obj);
break;
case T_CurrentOfExpr:
_outCurrentOfExpr(str, obj);
break;
Code review for NextValueExpr expression node type. Add missing infrastructure for this node type, notably in ruleutils.c where its lack could demonstrably cause EXPLAIN to fail. Add outfuncs/readfuncs support. (outfuncs support is useful today for debugging purposes. The readfuncs support may never be needed, since at present it would only matter for parallel query and NextValueExpr should never appear in a parallelizable query; but it seems like a bad idea to have a primnode type that isn't fully supported here.) Teach planner infrastructure that NextValueExpr is a volatile, parallel-unsafe, non-leaky expression node with cost cpu_operator_cost. Given its limited scope of usage, there *might* be no live bug today from the lack of that knowledge, but it's certainly going to bite us on the rear someday. Teach pg_stat_statements about the new node type, too. While at it, also teach cost_qual_eval() that MinMaxExpr, SQLValueFunction, XmlExpr, and CoerceToDomain should be charged as cpu_operator_cost. Failing to do this for SQLValueFunction was an oversight in my commit 0bb51aa96. The others are longer-standing oversights, but no time like the present to fix them. (In principle, CoerceToDomain could have cost much higher than this, but it doesn't presently seem worth trying to examine the domain's constraints here.) Modify execExprInterp.c to execute NextValueExpr as an out-of-line function; it seems quite unlikely to me that it's worth insisting that it be inlined in all expression eval methods. Besides, providing the out-of-line function doesn't stop anyone from inlining if they want to. Adjust some places where NextValueExpr support had been inserted with the aid of a dartboard rather than keeping it in the same order as elsewhere. Discussion: https://postgr.es/m/23862.1499981661@sss.pgh.pa.us
2017-07-14 21:25:43 +02:00
case T_NextValueExpr:
_outNextValueExpr(str, obj);
break;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
case T_InferenceElem:
_outInferenceElem(str, obj);
break;
case T_TargetEntry:
_outTargetEntry(str, obj);
break;
case T_RangeTblRef:
_outRangeTblRef(str, obj);
break;
case T_JoinExpr:
_outJoinExpr(str, obj);
break;
case T_FromExpr:
_outFromExpr(str, obj);
break;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
case T_OnConflictExpr:
_outOnConflictExpr(str, obj);
break;
case T_Path:
_outPath(str, obj);
break;
case T_IndexPath:
_outIndexPath(str, obj);
break;
case T_BitmapHeapPath:
_outBitmapHeapPath(str, obj);
break;
case T_BitmapAndPath:
_outBitmapAndPath(str, obj);
break;
case T_BitmapOrPath:
_outBitmapOrPath(str, obj);
break;
case T_TidPath:
_outTidPath(str, obj);
break;
case T_TidRangePath:
_outTidRangePath(str, obj);
break;
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
case T_SubqueryScanPath:
_outSubqueryScanPath(str, obj);
break;
case T_ForeignPath:
_outForeignPath(str, obj);
break;
case T_CustomPath:
_outCustomPath(str, obj);
break;
case T_AppendPath:
_outAppendPath(str, obj);
break;
case T_MergeAppendPath:
_outMergeAppendPath(str, obj);
break;
In the planner, replace an empty FROM clause with a dummy RTE. The fact that "SELECT expression" has no base relations has long been a thorn in the side of the planner. It makes it hard to flatten a sub-query that looks like that, or is a trivial VALUES() item, because the planner generally uses relid sets to identify sub-relations, and such a sub-query would have an empty relid set if we flattened it. prepjointree.c contains some baroque logic that works around this in certain special cases --- but there is a much better answer. We can replace an empty FROM clause with a dummy RTE that acts like a table of one row and no columns, and then there are no such corner cases to worry about. Instead we need some logic to get rid of useless dummy RTEs, but that's simpler and covers more cases than what was there before. For really trivial cases, where the query is just "SELECT expression" and nothing else, there's a hazard that adding the extra RTE makes for a noticeable slowdown; even though it's not much processing, there's not that much for the planner to do overall. However testing says that the penalty is very small, close to the noise level. In more complex queries, this is able to find optimizations that we could not find before. The new RTE type is called RTE_RESULT, since the "scan" plan type it gives rise to is a Result node (the same plan we produced for a "SELECT expression" query before). To avoid confusion, rename the old ResultPath path type to GroupResultPath, reflecting that it's only used in degenerate grouping cases where we know the query produces just one grouped row. (It wouldn't work to unify the two cases, because there are different rules about where the associated quals live during query_planner.) Note: although this touches readfuncs.c, I don't think a catversion bump is required, because the added case can't occur in stored rules, only plans. Patch by me, reviewed by David Rowley and Mark Dilger Discussion: https://postgr.es/m/15944.1521127664@sss.pgh.pa.us
2019-01-28 23:54:10 +01:00
case T_GroupResultPath:
_outGroupResultPath(str, obj);
break;
case T_MaterialPath:
_outMaterialPath(str, obj);
break;
case T_MemoizePath:
_outMemoizePath(str, obj);
Add Result Cache executor node (take 2) Here we add a new executor node type named "Result Cache". The planner can include this node type in the plan to have the executor cache the results from the inner side of parameterized nested loop joins. This allows caching of tuples for sets of parameters so that in the event that the node sees the same parameter values again, it can just return the cached tuples instead of rescanning the inner side of the join all over again. Internally, result cache uses a hash table in order to quickly find tuples that have been previously cached. For certain data sets, this can significantly improve the performance of joins. The best cases for using this new node type are for join problems where a large portion of the tuples from the inner side of the join have no join partner on the outer side of the join. In such cases, hash join would have to hash values that are never looked up, thus bloating the hash table and possibly causing it to multi-batch. Merge joins would have to skip over all of the unmatched rows. If we use a nested loop join with a result cache, then we only cache tuples that have at least one join partner on the outer side of the join. The benefits of using a parameterized nested loop with a result cache increase when there are fewer distinct values being looked up and the number of lookups of each value is large. Also, hash probes to lookup the cache can be much faster than the hash probe in a hash join as it's common that the result cache's hash table is much smaller than the hash join's due to result cache only caching useful tuples rather than all tuples from the inner side of the join. This variation in hash probe performance is more significant when the hash join's hash table no longer fits into the CPU's L3 cache, but the result cache's hash table does. The apparent "random" access of hash buckets with each hash probe can cause a poor L3 cache hit ratio for large hash tables. Smaller hash tables generally perform better. The hash table used for the cache limits itself to not exceeding work_mem * hash_mem_multiplier in size. We maintain a dlist of keys for this cache and when we're adding new tuples and realize we've exceeded the memory budget, we evict cache entries starting with the least recently used ones until we have enough memory to add the new tuples to the cache. For parameterized nested loop joins, we now consider using one of these result cache nodes in between the nested loop node and its inner node. We determine when this might be useful based on cost, which is primarily driven off of what the expected cache hit ratio will be. Estimating the cache hit ratio relies on having good distinct estimates on the nested loop's parameters. For now, the planner will only consider using a result cache for parameterized nested loop joins. This works for both normal joins and also for LATERAL type joins to subqueries. It is possible to use this new node for other uses in the future. For example, to cache results from correlated subqueries. However, that's not done here due to some difficulties obtaining a distinct estimation on the outer plan to calculate the estimated cache hit ratio. Currently we plan the inner plan before planning the outer plan so there is no good way to know if a result cache would be useful or not since we can't estimate the number of times the subplan will be called until the outer plan is generated. The functionality being added here is newly introducing a dependency on the return value of estimate_num_groups() during the join search. Previously, during the join search, we only ever needed to perform selectivity estimations. With this commit, we need to use estimate_num_groups() in order to estimate what the hit ratio on the result cache will be. In simple terms, if we expect 10 distinct values and we expect 1000 outer rows, then we'll estimate the hit ratio to be 99%. Since cache hits are very cheap compared to scanning the underlying nodes on the inner side of the nested loop join, then this will significantly reduce the planner's cost for the join. However, it's fairly easy to see here that things will go bad when estimate_num_groups() incorrectly returns a value that's significantly lower than the actual number of distinct values. If this happens then that may cause us to make use of a nested loop join with a result cache instead of some other join type, such as a merge or hash join. Our distinct estimations have been known to be a source of trouble in the past, so the extra reliance on them here could cause the planner to choose slower plans than it did previous to having this feature. Distinct estimations are also fairly hard to estimate accurately when several tables have been joined already or when a WHERE clause filters out a set of values that are correlated to the expressions we're estimating the number of distinct value for. For now, the costing we perform during query planning for result caches does put quite a bit of faith in the distinct estimations being accurate. When these are accurate then we should generally see faster execution times for plans containing a result cache. However, in the real world, we may find that we need to either change the costings to put less trust in the distinct estimations being accurate or perhaps even disable this feature by default. There's always an element of risk when we teach the query planner to do new tricks that it decides to use that new trick at the wrong time and causes a regression. Users may opt to get the old behavior by turning the feature off using the enable_resultcache GUC. Currently, this is enabled by default. It remains to be seen if we'll maintain that setting for the release. Additionally, the name "Result Cache" is the best name I could think of for this new node at the time I started writing the patch. Nobody seems to strongly dislike the name. A few people did suggest other names but no other name seemed to dominate in the brief discussion that there was about names. Let's allow the beta period to see if the current name pleases enough people. If there's some consensus on a better name, then we can change it before the release. Please see the 2nd discussion link below for the discussion on the "Result Cache" name. Author: David Rowley Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie Tested-By: Konstantin Knizhnik Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00
break;
case T_UniquePath:
_outUniquePath(str, obj);
break;
case T_GatherPath:
_outGatherPath(str, obj);
break;
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
case T_ProjectionPath:
_outProjectionPath(str, obj);
break;
Move targetlist SRF handling from expression evaluation to new executor node. Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT generate_series(1,5)) so far was done in the expression evaluation (i.e. ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code. This meant that most executor nodes performing projection, and most expression evaluation functions, had to deal with the possibility that an evaluated expression could return a set of return values. That's bad because it leads to repeated code in a lot of places. It also, and that's my (Andres's) motivation, made it a lot harder to implement a more efficient way of doing expression evaluation. To fix this, introduce a new executor node (ProjectSet) that can evaluate targetlists containing one or more SRFs. To avoid the complexity of the old way of handling nested expressions returning sets (e.g. having to pass up ExprDoneCond, and dealing with arguments to functions returning sets etc.), those SRFs can only be at the top level of the node's targetlist. The planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is only necessary in ProjectSet nodes and that SRFs are only present at the top level of the node's targetlist. If there are nested SRFs the planner creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get input from an underlying node. We also discussed and prototyped evaluating targetlist SRFs using ROWS FROM(), but that turned out to be more complicated than we'd hoped. While moving SRF evaluation to ProjectSet would allow to retain the old "least common multiple" behavior when multiple SRFs are present in one targetlist (i.e. continue returning rows until all SRFs are at the end of their input at the same time), we decided to instead only return rows till all SRFs are exhausted, returning NULL for already exhausted ones. We deemed the previous behavior to be too confusing, unexpected and actually not particularly useful. As a side effect, the previously prohibited case of multiple set returning arguments to a function, is now allowed. Not because it's particularly desirable, but because it ends up working and there seems to be no argument for adding code to prohibit it. Currently the behavior for COALESCE and CASE containing SRFs has changed, returning multiple rows from the expression, even when the SRF containing "arm" of the expression is not evaluated. That's because the SRFs are evaluated in a separate ProjectSet node. As that's quite confusing, we're likely to instead prohibit SRFs in those places. But that's still being discussed, and the code would reside in places not touched here, so that's a task for later. There's a lot of, now superfluous, code dealing with set return expressions around. But as the changes to get rid of those are verbose largely boring, it seems better for readability to keep the cleanup as a separate commit. Author: Tom Lane and Andres Freund Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
case T_ProjectSetPath:
_outProjectSetPath(str, obj);
break;
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
case T_SortPath:
_outSortPath(str, obj);
break;
case T_IncrementalSortPath:
_outIncrementalSortPath(str, obj);
break;
Make the upper part of the planner work by generating and comparing Paths. I've been saying we needed to do this for more than five years, and here it finally is. This patch removes the ever-growing tangle of spaghetti logic that grouping_planner() used to use to try to identify the best plan for post-scan/join query steps. Now, there is (nearly) independent consideration of each execution step, and entirely separate construction of Paths to represent each of the possible ways to do that step. We choose the best Path or set of Paths using the same add_path() logic that's been used inside query_planner() for years. In addition, this patch removes the old restriction that subquery_planner() could return only a single Plan. It now returns a RelOptInfo containing a set of Paths, just as query_planner() does, and the parent query level can use each of those Paths as the basis of a SubqueryScanPath at its level. This allows finding some optimizations that we missed before, wherein a subquery was capable of returning presorted data and thereby avoiding a sort in the parent level, making the overall cost cheaper even though delivering sorted output was not the cheapest plan for the subquery in isolation. (A couple of regression test outputs change in consequence of that. However, there is very little change in visible planner behavior overall, because the point of this patch is not to get immediate planning benefits but to create the infrastructure for future improvements.) There is a great deal left to do here. This patch unblocks a lot of planner work that was basically impractical in the old code structure, such as allowing FDWs to implement remote aggregation, or rewriting plan_set_operations() to allow consideration of multiple implementation orders for set operations. (The latter will likely require a full rewrite of plan_set_operations(); what I've done here is only to fix it to return Paths not Plans.) I have also left unfinished some localized refactoring in createplan.c and planner.c, because it was not necessary to get this patch to a working state. Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
case T_GroupPath:
_outGroupPath(str, obj);
break;
case T_UpperUniquePath:
_outUpperUniquePath(str, obj);
break;
case T_AggPath:
_outAggPath(str, obj);
break;
case T_GroupingSetsPath:
_outGroupingSetsPath(str, obj);
break;
case T_MinMaxAggPath:
_outMinMaxAggPath(str, obj);
break;
case T_WindowAggPath:
_outWindowAggPath(str, obj);
break;
case T_SetOpPath:
_outSetOpPath(str, obj);
break;
case T_RecursiveUnionPath:
_outRecursiveUnionPath(str, obj);
break;
case T_LockRowsPath:
_outLockRowsPath(str, obj);
break;
case T_ModifyTablePath:
_outModifyTablePath(str, obj);
break;
case T_LimitPath:
_outLimitPath(str, obj);
break;
case T_GatherMergePath:
_outGatherMergePath(str, obj);
break;
1999-02-12 07:43:53 +01:00
case T_NestPath:
_outNestPath(str, obj);
break;
case T_MergePath:
_outMergePath(str, obj);
break;
case T_HashPath:
_outHashPath(str, obj);
break;
case T_PlannerGlobal:
_outPlannerGlobal(str, obj);
break;
case T_PlannerInfo:
_outPlannerInfo(str, obj);
break;
case T_RelOptInfo:
_outRelOptInfo(str, obj);
break;
case T_IndexOptInfo:
_outIndexOptInfo(str, obj);
break;
case T_ForeignKeyOptInfo:
_outForeignKeyOptInfo(str, obj);
break;
case T_EquivalenceClass:
_outEquivalenceClass(str, obj);
break;
case T_EquivalenceMember:
_outEquivalenceMember(str, obj);
break;
case T_PathKey:
_outPathKey(str, obj);
break;
case T_PathTarget:
_outPathTarget(str, obj);
break;
Revise parameterized-path mechanism to fix assorted issues. This patch adjusts the treatment of parameterized paths so that all paths with the same parameterization (same set of required outer rels) for the same relation will have the same rowcount estimate. We cache the rowcount estimates to ensure that property, and hopefully save a few cycles too. Doing this makes it practical for add_path_precheck to operate without a rowcount estimate: it need only assume that paths with different parameterizations never dominate each other, which is close enough to true anyway for coarse filtering, because normally a more-parameterized path should yield fewer rows thanks to having more join clauses to apply. In add_path, we do the full nine yards of comparing rowcount estimates along with everything else, so that we can discard parameterized paths that don't actually have an advantage. This fixes some issues I'd found with add_path rejecting parameterized paths on the grounds that they were more expensive than not-parameterized ones, even though they yielded many fewer rows and hence would be cheaper once subsequent joining was considered. To make the same-rowcounts assumption valid, we have to require that any parameterized path enforce *all* join clauses that could be obtained from the particular set of outer rels, even if not all of them are useful for indexing. This is required at both base scans and joins. It's a good thing anyway since the net impact is that join quals are checked at the lowest practical level in the join tree. Hence, discard the original rather ad-hoc mechanism for choosing parameterization joinquals, and build a better one that has a more principled rule for when clauses can be moved. The original rule was actually buggy anyway for lack of knowledge about which relations are part of an outer join's outer side; getting this right requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
case T_ParamPathInfo:
_outParamPathInfo(str, obj);
break;
case T_RestrictInfo:
_outRestrictInfo(str, obj);
break;
Refactor the representation of indexable clauses in IndexPaths. In place of three separate but interrelated lists (indexclauses, indexquals, and indexqualcols), an IndexPath now has one list "indexclauses" of IndexClause nodes. This holds basically the same information as before, but in a more useful format: in particular, there is now a clear connection between an indexclause (an original restriction clause from WHERE or JOIN/ON) and the indexquals (directly usable index conditions) derived from it. We also change the ground rules a bit by mandating that clause commutation, if needed, be done up-front so that what is stored in the indexquals list is always directly usable as an index condition. This gets rid of repeated re-determination of which side of the clause is the indexkey during costing and plan generation, as well as repeated lookups of the commutator operator. To minimize the added up-front cost, the typical case of commuting a plain OpExpr is handled by a new special-purpose function commute_restrictinfo(). For RowCompareExprs, generating the new clause properly commuted to begin with is not really any more complex than before, it's just different --- and we can save doing that work twice, as the pretty-klugy original implementation did. Tracking the connection between original and derived clauses lets us also track explicitly whether the derived clauses are an exact or lossy translation of the original. This provides a cheap solution to getting rid of unnecessary rechecks of boolean index clauses, which previously seemed like it'd be more expensive than it was worth. Another pleasant (IMO) side-effect is that EXPLAIN now always shows index clauses with the indexkey on the left; this seems less confusing. This commit leaves expand_indexqual_conditions() and some related functions in a slightly messy state. I didn't bother to change them any more than minimally necessary to work with the new data structure, because all that code is going to be refactored out of existence in a follow-on patch. Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
case T_IndexClause:
_outIndexClause(str, obj);
break;
case T_PlaceHolderVar:
_outPlaceHolderVar(str, obj);
break;
case T_SpecialJoinInfo:
_outSpecialJoinInfo(str, obj);
break;
case T_AppendRelInfo:
_outAppendRelInfo(str, obj);
break;
Rework planning and execution of UPDATE and DELETE. This patch makes two closely related sets of changes: 1. For UPDATE, the subplan of the ModifyTable node now only delivers the new values of the changed columns (i.e., the expressions computed in the query's SET clause) plus row identity information such as CTID. ModifyTable must re-fetch the original tuple to merge in the old values of any unchanged columns. The core advantage of this is that the changed columns are uniform across all tables of an inherited or partitioned target relation, whereas the other columns might not be. A secondary advantage, when the UPDATE involves joins, is that less data needs to pass through the plan tree. The disadvantage of course is an extra fetch of each tuple to be updated. However, that seems to be very nearly free in context; even worst-case tests don't show it to add more than a couple percent to the total query cost. At some point it might be interesting to combine the re-fetch with the tuple access that ModifyTable must do anyway to mark the old tuple dead; but that would require a good deal of refactoring and it seems it wouldn't buy all that much, so this patch doesn't attempt it. 2. For inherited UPDATE/DELETE, instead of generating a separate subplan for each target relation, we now generate a single subplan that is just exactly like a SELECT's plan, then stick ModifyTable on top of that. To let ModifyTable know which target relation a given incoming row refers to, a tableoid junk column is added to the row identity information. This gets rid of the horrid hack that was inheritance_planner(), eliminating O(N^2) planning cost and memory consumption in cases where there were many unprunable target relations. Point 2 of course requires point 1, so that there is a uniform definition of the non-junk columns to be returned by the subplan. We can't insist on uniform definition of the row identity junk columns however, if we want to keep the ability to have both plain and foreign tables in a partitioning hierarchy. Since it wouldn't scale very far to have every child table have its own row identity column, this patch includes provisions to merge similar row identity columns into one column of the subplan result. In particular, we can merge the whole-row Vars typically used as row identity by FDWs into one column by pretending they are type RECORD. (It's still okay for the actual composite Datums to be labeled with the table's rowtype OID, though.) There is more that can be done to file down residual inefficiencies in this patch, but it seems to be committable now. FDW authors should note several API changes: * The argument list for AddForeignUpdateTargets() has changed, and so has the method it must use for adding junk columns to the query. Call add_row_identity_var() instead of manipulating the parse tree directly. You might want to reconsider exactly what you're adding, too. * PlanDirectModify() must now work a little harder to find the ForeignScan plan node; if the foreign table is part of a partitioning hierarchy then the ForeignScan might not be the direct child of ModifyTable. See postgres_fdw for sample code. * To check whether a relation is a target relation, it's no longer sufficient to compare its relid to root->parse->resultRelation. Instead, check it against all_result_relids or leaf_result_relids, as appropriate. Amit Langote and Tom Lane Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
case T_RowIdentityVarInfo:
_outRowIdentityVarInfo(str, obj);
break;
case T_PlaceHolderInfo:
_outPlaceHolderInfo(str, obj);
break;
case T_MinMaxAggInfo:
_outMinMaxAggInfo(str, obj);
break;
case T_PlannerParamItem:
_outPlannerParamItem(str, obj);
break;
case T_RollupData:
_outRollupData(str, obj);
break;
case T_GroupingSetData:
_outGroupingSetData(str, obj);
break;
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
case T_StatisticExtInfo:
_outStatisticExtInfo(str, obj);
break;
case T_ExtensibleNode:
_outExtensibleNode(str, obj);
break;
case T_CreateStmt:
_outCreateStmt(str, obj);
break;
case T_CreateForeignTableStmt:
_outCreateForeignTableStmt(str, obj);
break;
case T_ImportForeignSchemaStmt:
_outImportForeignSchemaStmt(str, obj);
break;
case T_IndexStmt:
_outIndexStmt(str, obj);
break;
Implement multivariate n-distinct coefficients Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
case T_CreateStatsStmt:
_outCreateStatsStmt(str, obj);
break;
case T_AlterStatsStmt:
_outAlterStatsStmt(str, obj);
break;
case T_NotifyStmt:
_outNotifyStmt(str, obj);
break;
case T_DeclareCursorStmt:
_outDeclareCursorStmt(str, obj);
break;
case T_SelectStmt:
_outSelectStmt(str, obj);
break;
SQL-standard function body This adds support for writing CREATE FUNCTION and CREATE PROCEDURE statements for language SQL with a function body that conforms to the SQL standard and is portable to other implementations. Instead of the PostgreSQL-specific AS $$ string literal $$ syntax, this allows writing out the SQL statements making up the body unquoted, either as a single statement: CREATE FUNCTION add(a integer, b integer) RETURNS integer LANGUAGE SQL RETURN a + b; or as a block CREATE PROCEDURE insert_data(a integer, b integer) LANGUAGE SQL BEGIN ATOMIC INSERT INTO tbl VALUES (a); INSERT INTO tbl VALUES (b); END; The function body is parsed at function definition time and stored as expression nodes in a new pg_proc column prosqlbody. So at run time, no further parsing is required. However, this form does not support polymorphic arguments, because there is no more parse analysis done at call time. Dependencies between the function and the objects it uses are fully tracked. A new RETURN statement is introduced. This can only be used inside function bodies. Internally, it is treated much like a SELECT statement. psql needs some new intelligence to keep track of function body boundaries so that it doesn't send off statements when it sees semicolons that are inside a function body. Tested-by: Jaime Casanova <jcasanov@systemguards.com.ec> Reviewed-by: Julien Rouhaud <rjuju123@gmail.com> Discussion: https://www.postgresql.org/message-id/flat/1c11f1eb-f00c-43b7-799d-2d44132c02d7@2ndquadrant.com
2021-04-07 21:30:08 +02:00
case T_ReturnStmt:
_outReturnStmt(str, obj);
break;
case T_PLAssignStmt:
_outPLAssignStmt(str, obj);
break;
case T_ColumnDef:
_outColumnDef(str, obj);
break;
case T_TypeName:
_outTypeName(str, obj);
break;
case T_TypeCast:
_outTypeCast(str, obj);
break;
case T_CollateClause:
_outCollateClause(str, obj);
break;
case T_IndexElem:
_outIndexElem(str, obj);
break;
Extended statistics on expressions Allow defining extended statistics on expressions, not just just on simple column references. With this commit, expressions are supported by all existing extended statistics kinds, improving the same types of estimates. A simple example may look like this: CREATE TABLE t (a int); CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; ANALYZE t; The collected statistics are useful e.g. to estimate queries with those expressions in WHERE or GROUP BY clauses: SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); This introduces new internal statistics kind 'e' (expressions) which is built automatically when the statistics object definition includes any expressions. This represents single-expression statistics, as if there was an expression index (but without the index maintenance overhead). The statistics is stored in pg_statistics_ext_data as an array of composite types, which is possible thanks to 79f6a942bd. CREATE STATISTICS allows building statistics on a single expression, in which case in which case it's not possible to specify statistics kinds. A new system view pg_stats_ext_exprs can be used to display expression statistics, similarly to pg_stats and pg_stats_ext views. ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it treats indexes, i.e. it drops and recreates the statistics. This means all statistics are reset, and we no longer try to preserve at least the functional dependencies. This should not be a major issue in practice, as the functional dependencies actually rely on per-column statistics, which were always reset anyway. Author: Tomas Vondra Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
2021-03-26 23:22:01 +01:00
case T_StatsElem:
_outStatsElem(str, obj);
break;
case T_Query:
_outQuery(str, obj);
break;
case T_WithCheckOption:
_outWithCheckOption(str, obj);
break;
case T_SortGroupClause:
_outSortGroupClause(str, obj);
break;
Support GROUPING SETS, CUBE and ROLLUP. This SQL standard functionality allows to aggregate data by different GROUP BY clauses at once. Each grouping set returns rows with columns grouped by in other sets set to NULL. This could previously be achieved by doing each grouping as a separate query, conjoined by UNION ALLs. Besides being considerably more concise, grouping sets will in many cases be faster, requiring only one scan over the underlying data. The current implementation of grouping sets only supports using sorting for input. Individual sets that share a sort order are computed in one pass. If there are sets that don't share a sort order, additional sort & aggregation steps are performed. These additional passes are sourced by the previous sort step; thus avoiding repeated scans of the source data. The code is structured in a way that adding support for purely using hash aggregation or a mix of hashing and sorting is possible. Sorting was chosen to be supported first, as it is the most generic method of implementation. Instead of, as in an earlier versions of the patch, representing the chain of sort and aggregation steps as full blown planner and executor nodes, all but the first sort are performed inside the aggregation node itself. This avoids the need to do some unusual gymnastics to handle having to return aggregated and non-aggregated tuples from underlying nodes, as well as having to shut down underlying nodes early to limit memory usage. The optimizer still builds Sort/Agg node to describe each phase, but they're not part of the plan tree, but instead additional data for the aggregation node. They're a convenient and preexisting way to describe aggregation and sorting. The first (and possibly only) sort step is still performed as a separate execution step. That retains similarity with existing group by plans, makes rescans fairly simple, avoids very deep plans (leading to slow explains) and easily allows to avoid the sorting step if the underlying data is sorted by other means. A somewhat ugly side of this patch is having to deal with a grammar ambiguity between the new CUBE keyword and the cube extension/functions named cube (and rollup). To avoid breaking existing deployments of the cube extension it has not been renamed, neither has cube been made a reserved keyword. Instead precedence hacking is used to make GROUP BY cube(..) refer to the CUBE grouping sets feature, and not the function cube(). To actually group by a function cube(), unlikely as that might be, the function name has to be quoted. Needs a catversion bump because stored rules may change. Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
case T_GroupingSet:
_outGroupingSet(str, obj);
break;
case T_WindowClause:
_outWindowClause(str, obj);
break;
case T_RowMarkClause:
_outRowMarkClause(str, obj);
break;
case T_WithClause:
_outWithClause(str, obj);
break;
case T_CTESearchClause:
_outCTESearchClause(str, obj);
break;
case T_CTECycleClause:
_outCTECycleClause(str, obj);
break;
case T_CommonTableExpr:
_outCommonTableExpr(str, obj);
break;
Add support for MERGE SQL command MERGE performs actions that modify rows in the target table using a source table or query. MERGE provides a single SQL statement that can conditionally INSERT/UPDATE/DELETE rows -- a task that would otherwise require multiple PL statements. For example, MERGE INTO target AS t USING source AS s ON t.tid = s.sid WHEN MATCHED AND t.balance > s.delta THEN UPDATE SET balance = t.balance - s.delta WHEN MATCHED THEN DELETE WHEN NOT MATCHED AND s.delta > 0 THEN INSERT VALUES (s.sid, s.delta) WHEN NOT MATCHED THEN DO NOTHING; MERGE works with regular tables, partitioned tables and inheritance hierarchies, including column and row security enforcement, as well as support for row and statement triggers and transition tables therein. MERGE is optimized for OLTP and is parameterizable, though also useful for large scale ETL/ELT. MERGE is not intended to be used in preference to existing single SQL commands for INSERT, UPDATE or DELETE since there is some overhead. MERGE can be used from PL/pgSQL. MERGE does not support targetting updatable views or foreign tables, and RETURNING clauses are not allowed either. These limitations are likely fixable with sufficient effort. Rewrite rules are also not supported, but it's not clear that we'd want to support them. Author: Pavan Deolasee <pavan.deolasee@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Author: Amit Langote <amitlangote09@gmail.com> Author: Simon Riggs <simon.riggs@enterprisedb.com> Reviewed-by: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Reviewed-by: Andres Freund <andres@anarazel.de> (earlier versions) Reviewed-by: Peter Geoghegan <pg@bowt.ie> (earlier versions) Reviewed-by: Robert Haas <robertmhaas@gmail.com> (earlier versions) Reviewed-by: Japin Li <japinli@hotmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Zhihong Yu <zyu@yugabyte.com> Discussion: https://postgr.es/m/CANP8+jKitBSrB7oTgT9CY2i1ObfOt36z0XMraQc+Xrz8QB0nXA@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkJdBuxj9PO=2QaO9-3h3xGbQPZ34kJH=HukRekwM-GZg@mail.gmail.com Discussion: https://postgr.es/m/20201231134736.GA25392@alvherre.pgsql
2022-03-28 16:45:58 +02:00
case T_MergeWhenClause:
_outMergeWhenClause(str, obj);
break;
case T_MergeAction:
_outMergeAction(str, obj);
break;
case T_SetOperationStmt:
_outSetOperationStmt(str, obj);
break;
case T_RangeTblEntry:
_outRangeTblEntry(str, obj);
break;
case T_RangeTblFunction:
_outRangeTblFunction(str, obj);
break;
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
case T_TableSampleClause:
_outTableSampleClause(str, obj);
break;
case T_A_Expr:
_outA_Expr(str, obj);
break;
case T_ColumnRef:
_outColumnRef(str, obj);
break;
case T_ParamRef:
_outParamRef(str, obj);
break;
case T_RawStmt:
_outRawStmt(str, obj);
break;
case T_A_Const:
_outA_Const(str, obj);
break;
case T_A_Star:
_outA_Star(str, obj);
break;
case T_A_Indices:
_outA_Indices(str, obj);
break;
case T_A_Indirection:
_outA_Indirection(str, obj);
break;
case T_A_ArrayExpr:
_outA_ArrayExpr(str, obj);
break;
case T_ResTarget:
_outResTarget(str, obj);
break;
case T_MultiAssignRef:
_outMultiAssignRef(str, obj);
break;
case T_SortBy:
_outSortBy(str, obj);
break;
case T_WindowDef:
_outWindowDef(str, obj);
break;
case T_RangeSubselect:
_outRangeSubselect(str, obj);
break;
case T_RangeFunction:
_outRangeFunction(str, obj);
break;
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
case T_RangeTableSample:
_outRangeTableSample(str, obj);
break;
case T_RangeTableFunc:
_outRangeTableFunc(str, obj);
break;
case T_RangeTableFuncCol:
_outRangeTableFuncCol(str, obj);
break;
1998-12-04 16:34:49 +01:00
case T_Constraint:
_outConstraint(str, obj);
break;
case T_FuncCall:
_outFuncCall(str, obj);
break;
case T_DefElem:
_outDefElem(str, obj);
break;
case T_TableLikeClause:
_outTableLikeClause(str, obj);
break;
case T_LockingClause:
_outLockingClause(str, obj);
break;
case T_XmlSerialize:
_outXmlSerialize(str, obj);
break;
case T_ForeignKeyCacheInfo:
_outForeignKeyCacheInfo(str, obj);
break;
case T_TriggerTransition:
_outTriggerTransition(str, obj);
break;
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
case T_PartitionElem:
_outPartitionElem(str, obj);
break;
Code review focused on new node types added by partitioning support. Fix failure to check that we got a plain Const from const-simplification of a coercion request. This is the cause of bug #14666 from Tian Bing: there is an int4 to money cast, but it's only stable not immutable (because of dependence on lc_monetary), resulting in a FuncExpr that the code was miserably unequipped to deal with, or indeed even to notice that it was failing to deal with. Add test cases around this coercion behavior. In view of the above, sprinkle the code liberally with castNode() macros, in hope of catching the next such bug a bit sooner. Also, change some functions that were randomly declared to take Node* to take more specific pointer types. And change some struct fields that were declared Node* but could be given more specific types, allowing removal of assorted explicit casts. Place PARTITION_MAX_KEYS check a bit closer to the code it's protecting. Likewise check only-one-key-for-list-partitioning restriction in a less random place. Avoid not-per-project-style usages like !strcmp(...). Fix assorted failures to avoid scribbling on the input of parse transformation. I'm not sure how necessary this is, but it's entirely silly for these functions to be expending cycles to avoid that and not getting it right. Add guards against partitioning on system columns. Put backend/nodes/ support code into an order that matches handling of these node types elsewhere. Annotate the fact that somebody added location fields to PartitionBoundSpec and PartitionRangeDatum but forgot to handle them in outfuncs.c/readfuncs.c. This is fairly harmless for production purposes (since readfuncs.c would just substitute -1 anyway) but it's still bogus. It's not worth forcing a post-beta1 initdb just to fix this, but if we have another reason to force initdb before 10.0, we should go back and clean this up. Contrariwise, somebody added location fields to PartitionElem and PartitionSpec but forgot to teach exprLocation() about them. Consolidate duplicative code in transformPartitionBound(). Improve a couple of error messages. Improve assorted commentary. Re-pgindent the files touched by this patch; this affects a few comment blocks that must have been added quite recently. Report: https://postgr.es/m/20170524024550.29935.14396@wrigleys.postgresql.org
2017-05-29 05:20:28 +02:00
case T_PartitionSpec:
_outPartitionSpec(str, obj);
break;
Implement table partitioning. Table partitioning is like table inheritance and reuses much of the existing infrastructure, but there are some important differences. The parent is called a partitioned table and is always empty; it may not have indexes or non-inherited constraints, since those make no sense for a relation with no data of its own. The children are called partitions and contain all of the actual data. Each partition has an implicit partitioning constraint. Multiple inheritance is not allowed, and partitioning and inheritance can't be mixed. Partitions can't have extra columns and may not allow nulls unless the parent does. Tuples inserted into the parent are automatically routed to the correct partition, so tuple-routing ON INSERT triggers are not needed. Tuple routing isn't yet supported for partitions which are foreign tables, and it doesn't handle updates that cross partition boundaries. Currently, tables can be range-partitioned or list-partitioned. List partitioning is limited to a single column, but range partitioning can involve multiple columns. A partitioning "column" can be an expression. Because table partitioning is less general than table inheritance, it is hoped that it will be easier to reason about properties of partitions, and therefore that this will serve as a better foundation for a variety of possible optimizations, including query planner optimizations. The tuple routing based which this patch does based on the implicit partitioning constraints is an example of this, but it seems likely that many other useful optimizations are also possible. Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat, Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova, Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
case T_PartitionBoundSpec:
_outPartitionBoundSpec(str, obj);
break;
case T_PartitionRangeDatum:
_outPartitionRangeDatum(str, obj);
break;
case T_JsonFormat:
_outJsonFormat(str, obj);
break;
case T_JsonReturning:
_outJsonReturning(str, obj);
break;
case T_JsonValueExpr:
_outJsonValueExpr(str, obj);
break;
case T_JsonConstructorExpr:
_outJsonConstructorExpr(str, obj);
break;
case T_JsonIsPredicate:
_outJsonIsPredicate(str, obj);
break;
case T_JsonBehavior:
_outJsonBehavior(str, obj);
break;
case T_JsonExpr:
_outJsonExpr(str, obj);
break;
case T_JsonCoercion:
_outJsonCoercion(str, obj);
break;
case T_JsonItemCoercions:
_outJsonItemCoercions(str, obj);
break;
case T_JsonTableParent:
_outJsonTableParent(str, obj);
break;
case T_JsonTableSibling:
_outJsonTableSibling(str, obj);
break;
Automatically generate node support functions Add a script to automatically generate the node support functions (copy, equal, out, and read, as well as the node tags enum) from the struct definitions. For each of the four node support files, it creates two include files, e.g., copyfuncs.funcs.c and copyfuncs.switch.c, to include in the main file. All the scaffolding of the main file stays in place. I have tried to mostly make the coverage of the output match what is currently there. For example, one could now do out/read coverage of utility statement nodes, but I have manually excluded those for now. The reason is mainly that it's easier to diff the before and after, and adding a bunch of stuff like this might require a separate analysis and review. Subtyping (TidScan -> Scan) is supported. For the hard cases, you can just write a manual function and exclude generating one. For the not so hard cases, there is a way of annotating struct fields to get special behaviors. For example, pg_node_attr(equal_ignore) has the field ignored in equal functions. (In this patch, I have only ifdef'ed out the code to could be removed, mainly so that it won't constantly have merge conflicts. It will be deleted in a separate patch. All the code comments that are worth keeping from those sections have already been moved to the header files where the structs are defined.) Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://www.postgresql.org/message-id/flat/c1097590-a6a4-486a-64b1-e1f9cc0533ce%40enterprisedb.com
2022-07-09 08:52:19 +02:00
#endif /* OBSOLETE */
default:
2003-08-04 02:43:34 +02:00
/*
* This should be an ERROR, but it's too useful to be able to
* dump structures that outNode only understands part of.
*/
elog(WARNING, "could not dump unrecognized node type: %d",
(int) nodeTag(obj));
break;
}
appendStringInfoChar(str, '}');
}
}
/*
* nodeToString -
* returns the ascii representation of the Node as a palloc'd string
*/
char *
nodeToString(const void *obj)
{
StringInfoData str;
/* see stringinfo.h for an explanation of this maneuver */
initStringInfo(&str);
outNode(&str, obj);
return str.data;
}
/*
* bmsToString -
* returns the ascii representation of the Bitmapset as a palloc'd string
*/
char *
bmsToString(const Bitmapset *bms)
{
StringInfoData str;
/* see stringinfo.h for an explanation of this maneuver */
initStringInfo(&str);
outBitmapset(&str, bms);
return str.data;
}