/*-------------------------------------------------------------------------
 *
 * outfuncs.c
 *    Output functions for Postgres tree nodes.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/nodes/outfuncs.c
 *
 * NOTES
 *    Every node type that can appear in stored rules' parsetrees *must*
 *    have an output function defined here (as well as an input function
 *    in readfuncs.c).  In addition, plan nodes should have input and
 *    output functions so that they can be sent to parallel workers.
 *
 *    For use in debugging, we also provide output functions for nodes
 *    that appear in raw parsetrees and planner Paths.  These node types
 *    need not have input functions.  Output support for raw parsetrees
 *    is somewhat incomplete, too; in particular, utility statements are
 *    almost entirely unsupported.  We try to support everything that can
 *    appear in a raw SELECT, though.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>

#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "nodes/extensible.h"
#include "nodes/pathnodes.h"
#include "nodes/plannodes.h"
#include "utils/datum.h"
#include "utils/rel.h"

static void outChar(StringInfo str, char c);


/*
 * Macros to simplify output of different kinds of fields.  Use these
 * wherever possible to reduce the chance for silly typos.  Note that these
 * hard-wire conventions about the names of the local variables in an Out
 * routine.
 */
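/*
 * Illustrative example (not taken verbatim from any one routine below):
 * given a local variable "node" with an int field plan_width equal to 32,
 * WRITE_INT_FIELD(plan_width) appends " :plan_width 32" to str, which
 * read.c's input routines can parse back into the field.
 */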

/* Write the label for the node type */
#define WRITE_NODE_TYPE(nodelabel) \
    appendStringInfoString(str, nodelabel)

/* Write an integer field (anything written as ":fldname %d") */
#define WRITE_INT_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)

/* Write an unsigned integer field (anything written as ":fldname %u") */
#define WRITE_UINT_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)

/* Write an unsigned integer field (anything written with UINT64_FORMAT) */
#define WRITE_UINT64_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, \
                     node->fldname)

/* Write an OID field (don't hard-wire assumption that OID is same as uint) */
#define WRITE_OID_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)

/* Write a long-integer field */
#define WRITE_LONG_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %ld", node->fldname)

/* Write a char field (ie, one ascii character) */
#define WRITE_CHAR_FIELD(fldname) \
    (appendStringInfo(str, " :" CppAsString(fldname) " "), \
     outChar(str, node->fldname))

/* Write an enumerated-type field as an integer code */
#define WRITE_ENUM_FIELD(fldname, enumtype) \
    appendStringInfo(str, " :" CppAsString(fldname) " %d", \
                     (int) node->fldname)

/* Write a float field --- caller must give format to define precision */
#define WRITE_FLOAT_FIELD(fldname,format) \
    appendStringInfo(str, " :" CppAsString(fldname) " " format, node->fldname)

/* Write a boolean field */
#define WRITE_BOOL_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %s", \
                     booltostr(node->fldname))

/* Write a character-string (possibly NULL) field */
#define WRITE_STRING_FIELD(fldname) \
    (appendStringInfoString(str, " :" CppAsString(fldname) " "), \
     outToken(str, node->fldname))

/* Write a parse location field (actually same as INT case) */
#define WRITE_LOCATION_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)

/* Write a Node field */
#define WRITE_NODE_FIELD(fldname) \
    (appendStringInfoString(str, " :" CppAsString(fldname) " "), \
     outNode(str, node->fldname))

/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
    (appendStringInfoString(str, " :" CppAsString(fldname) " "), \
     outBitmapset(str, node->fldname))

#define WRITE_ATTRNUMBER_ARRAY(fldname, len) \
    do { \
        appendStringInfoString(str, " :" CppAsString(fldname) " "); \
        for (int i = 0; i < len; i++) \
            appendStringInfo(str, " %d", node->fldname[i]); \
    } while(0)

#define WRITE_OID_ARRAY(fldname, len) \
    do { \
        appendStringInfoString(str, " :" CppAsString(fldname) " "); \
        for (int i = 0; i < len; i++) \
            appendStringInfo(str, " %u", node->fldname[i]); \
    } while(0)

/*
 * This macro supports the case that the field is NULL.  For the other array
 * macros, that is currently not needed.
 */
#define WRITE_INDEX_ARRAY(fldname, len) \
    do { \
        appendStringInfoString(str, " :" CppAsString(fldname) " "); \
        if (node->fldname) \
            for (int i = 0; i < len; i++) \
                appendStringInfo(str, " %u", node->fldname[i]); \
        else \
            appendStringInfoString(str, "<>"); \
    } while(0)

#define WRITE_INT_ARRAY(fldname, len) \
    do { \
        appendStringInfoString(str, " :" CppAsString(fldname) " "); \
        for (int i = 0; i < len; i++) \
            appendStringInfo(str, " %d", node->fldname[i]); \
    } while(0)

#define WRITE_BOOL_ARRAY(fldname, len) \
    do { \
        appendStringInfoString(str, " :" CppAsString(fldname) " "); \
        for (int i = 0; i < len; i++) \
            appendStringInfo(str, " %s", booltostr(node->fldname[i])); \
    } while(0)

#define booltostr(x)  ((x) ? "true" : "false")


/*
 * outToken
 *    Convert an ordinary string (eg, an identifier) into a form that
 *    will be decoded back to a plain token by read.c's functions.
 *
 *    If a null or empty string is given, it is encoded as "<>".
 */
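/*
 * Illustrative examples of the escaping rules implemented below: "foo" is
 * emitted unchanged, "<x>" becomes "\<x>", "42" becomes "\42", and "a b"
 * becomes "a\ b", so that pg_strtok() sees each as a single token.
 */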
void
outToken(StringInfo str, const char *s)
{
    if (s == NULL || *s == '\0')
    {
        appendStringInfoString(str, "<>");
        return;
    }

    /*
     * Look for characters or patterns that are treated specially by read.c
     * (either in pg_strtok() or in nodeRead()), and therefore need a
     * protective backslash.
     */
    /* These characters only need to be quoted at the start of the string */
    if (*s == '<' ||
        *s == '"' ||
        isdigit((unsigned char) *s) ||
        ((*s == '+' || *s == '-') &&
         (isdigit((unsigned char) s[1]) || s[1] == '.')))
        appendStringInfoChar(str, '\\');
    while (*s)
    {
        /* These chars must be backslashed anywhere in the string */
        if (*s == ' ' || *s == '\n' || *s == '\t' ||
            *s == '(' || *s == ')' || *s == '{' || *s == '}' ||
            *s == '\\')
            appendStringInfoChar(str, '\\');
        appendStringInfoChar(str, *s++);
    }
}

/*
 * Convert one char.  Goes through outToken() so that special characters are
 * escaped.
 */
static void
outChar(StringInfo str, char c)
{
    char        in[2];

    in[0] = c;
    in[1] = '\0';

    outToken(str, in);
}
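
/*
 * _outList
 *    Serialize a List.  A plain node List prints as "(elem elem ...)";
 *    IntList, OidList, and XidList carry a type marker, so that, for
 *    example, an IntList containing 1, 2, and 3 prints as "(i 1 2 3)".
 */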
static void
_outList(StringInfo str, const List *node)
{
    const ListCell *lc;

    appendStringInfoChar(str, '(');

    if (IsA(node, IntList))
        appendStringInfoChar(str, 'i');
    else if (IsA(node, OidList))
        appendStringInfoChar(str, 'o');
    else if (IsA(node, XidList))
        appendStringInfoChar(str, 'x');

    foreach(lc, node)
    {
        /*
         * For the sake of backward compatibility, we emit a slightly
         * different whitespace format for lists of nodes vs. other types of
         * lists.  XXX: is this necessary?
         */
        if (IsA(node, List))
        {
            outNode(str, lfirst(lc));
            if (lnext(node, lc))
                appendStringInfoChar(str, ' ');
        }
        else if (IsA(node, IntList))
            appendStringInfo(str, " %d", lfirst_int(lc));
        else if (IsA(node, OidList))
            appendStringInfo(str, " %u", lfirst_oid(lc));
        else if (IsA(node, XidList))
            appendStringInfo(str, " %u", lfirst_xid(lc));
        else
            elog(ERROR, "unrecognized list node type: %d",
                 (int) node->type);
    }

    appendStringInfoChar(str, ')');
}

/*
 * outBitmapset -
 *    converts a bitmap set of integers
 *
 * Note: the output format is "(b int int ...)", similar to an integer List.
 */
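/*
 * For example, a set with members 3, 5, and 10 is emitted as "(b 3 5 10)";
 * bms_next_member() guarantees the members appear in increasing order.
 */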
void
outBitmapset(StringInfo str, const Bitmapset *bms)
{
    int         x;

    appendStringInfoChar(str, '(');
    appendStringInfoChar(str, 'b');
    x = -1;
    while ((x = bms_next_member(bms, x)) >= 0)
        appendStringInfo(str, " %d", x);
    appendStringInfoChar(str, ')');
}

/*
 * Print the value of a Datum given its type.
 */
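/*
 * For example (assuming 8-byte Datums on a little-endian machine), a
 * pass-by-value int4 Datum holding 1 prints as "4 [ 1 0 0 0 0 0 0 0 ]":
 * the value's size, then every byte of the Datum.
 */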
void
outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
{
    Size        length,
                i;
    char       *s;

    length = datumGetSize(value, typbyval, typlen);

    if (typbyval)
    {
        s = (char *) (&value);
        appendStringInfo(str, "%u [ ", (unsigned int) length);
        for (i = 0; i < (Size) sizeof(Datum); i++)
            appendStringInfo(str, "%d ", (int) (s[i]));
        appendStringInfoChar(str, ']');
    }
    else
    {
        s = (char *) DatumGetPointer(value);
        if (!PointerIsValid(s))
            appendStringInfoString(str, "0 [ ]");
        else
        {
            appendStringInfo(str, "%u [ ", (unsigned int) length);
            for (i = 0; i < length; i++)
                appendStringInfo(str, "%d ", (int) (s[i]));
            appendStringInfoChar(str, ']');
        }
    }
}


/*
 * Stuff from plannodes.h
 */
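
/*
 * Note: outNode() wraps each node in braces, so a serialized PlannedStmt
 * begins roughly as "{PLANNEDSTMT :commandType 1 :queryId 0 ...}" (a
 * sketch; actual field values depend on the statement).
 */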
|
|
|
|
|
2007-02-20 18:32:18 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlannedStmt(StringInfo str, const PlannedStmt *node)
|
2007-02-20 18:32:18 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLANNEDSTMT");
|
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(commandType, CmdType);
|
2017-10-12 01:52:46 +02:00
|
|
|
WRITE_UINT64_FIELD(queryId);
|
2009-10-10 03:43:50 +02:00
|
|
|
WRITE_BOOL_FIELD(hasReturning);
|
2011-02-26 00:56:23 +01:00
|
|
|
WRITE_BOOL_FIELD(hasModifyingCTE);
|
2007-02-20 18:32:18 +01:00
|
|
|
WRITE_BOOL_FIELD(canSetTag);
|
2010-08-18 17:21:54 +02:00
|
|
|
WRITE_BOOL_FIELD(transientPlan);
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
WRITE_BOOL_FIELD(dependsOnRole);
|
|
|
|
WRITE_BOOL_FIELD(parallelModeNeeded);
|
2018-04-28 22:46:24 +02:00
|
|
|
WRITE_INT_FIELD(jitFlags);
|
2007-02-20 18:32:18 +01:00
|
|
|
WRITE_NODE_FIELD(planTree);
|
|
|
|
WRITE_NODE_FIELD(rtable);
|
|
|
|
WRITE_NODE_FIELD(resultRelations);
|
Further adjust EXPLAIN's choices of table alias names.
This patch causes EXPLAIN to always assign a separate table alias to the
parent RTE of an append relation (inheritance set); before, such RTEs
were ignored if not actually scanned by the plan. Since the child RTEs
now always have that same alias to start with (cf. commit 55a1954da),
the net effect is that the parent RTE usually gets the alias used or
implied by the query text, and the children all get that alias with "_N"
appended. (The exception to "usually" is if there are duplicate aliases
in different subtrees of the original query; then some of those original
RTEs will also have "_N" appended.)
This results in more uniform output for partitioned-table plans than
we had before: the partitioned table itself gets the original alias,
and all child tables have aliases with "_N", rather than the previous
behavior where one of the children would get an alias without "_N".
The reason for giving the parent RTE an alias, even if it isn't scanned
by the plan, is that we now use the parent's alias to qualify Vars that
refer to an appendrel output column and appear above the Append or
MergeAppend that computes the appendrel. But below the append, Vars
refer to some one of the child relations, and are displayed that way.
This seems clearer than the old behavior where a Var that could carry
values from any child relation was displayed as if it referred to only
one of them.
While at it, change ruleutils.c so that the code paths used by EXPLAIN
deal in Plan trees not PlanState trees. This effectively reverts a
decision made in commit 1cc29fe7c, which seemed like a good idea at
the time to make ruleutils.c consistent with explain.c. However,
it's problematic because we'd really like to allow executor startup
pruning to remove all the children of an append node when possible,
leaving no child PlanState to resolve Vars against. (That's not done
here, but will be in the next patch.) This requires different handling
of subplans and initplans than before, but is otherwise a pretty
straightforward change.
Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
|
|
|
WRITE_NODE_FIELD(appendRelations);
|
2007-02-22 23:00:26 +01:00
|
|
|
WRITE_NODE_FIELD(subplans);
|
2007-02-27 02:11:26 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(rewindPlanIDs);
|
2007-02-20 18:32:18 +01:00
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
2007-10-11 20:05:27 +02:00
|
|
|
WRITE_NODE_FIELD(relationOids);
|
2008-09-09 20:58:09 +02:00
|
|
|
WRITE_NODE_FIELD(invalItems);
|
2017-11-13 21:24:12 +01:00
|
|
|
WRITE_NODE_FIELD(paramExecTypes);
|
Change representation of statement lists, and add statement location info.
This patch makes several changes that improve the consistency of
representation of lists of statements. It's always been the case
that the output of parse analysis is a list of Query nodes, whatever
the types of the individual statements in the list. This patch brings
similar consistency to the outputs of raw parsing and planning steps:
* The output of raw parsing is now always a list of RawStmt nodes;
the statement-type-dependent nodes are one level down from that.
* The output of pg_plan_queries() is now always a list of PlannedStmt
nodes, even for utility statements. In the case of a utility statement,
"planning" just consists of wrapping a CMD_UTILITY PlannedStmt around
the utility node. This list representation is now used in Portal and
CachedPlan plan lists, replacing the former convention of intermixing
PlannedStmts with bare utility-statement nodes.
Now, every list of statements has a consistent head-node type depending
on how far along it is in processing. This allows changing many places
that formerly used generic "Node *" pointers to use a more specific
pointer type, thus reducing the number of IsA() tests and casts needed,
as well as improving code clarity.
Also, the post-parse-analysis representation of DECLARE CURSOR is changed
so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained
SELECT remains a child of the DeclareCursorStmt rather than getting flipped
around to be the other way. It's now true for both Query and PlannedStmt
that utilityStmt is non-null if and only if commandType is CMD_UTILITY.
That allows simplifying a lot of places that were testing both fields.
(I think some of those were just defensive programming, but in many places,
it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.)
Because PlannedStmt carries a canSetTag field, we're also able to get rid
of some ad-hoc rules about how to reconstruct canSetTag for a bare utility
statement; specifically, the assumption that a utility is canSetTag if and
only if it's the only one in its list. While I see no near-term need for
relaxing that restriction, it's nice to get rid of the ad-hocery.
The API of ProcessUtility() is changed so that what it's passed is the
wrapper PlannedStmt not just the bare utility statement. This will affect
all users of ProcessUtility_hook, but the changes are pretty trivial; see
the affected contrib modules for examples of the minimum change needed.
(Most compilers should give pointer-type-mismatch warnings for uncorrected
code.)
There's also a change in the API of ExplainOneQuery_hook, to pass through
cursorOptions instead of expecting hook functions to know what to pick.
This is needed because of the DECLARE CURSOR changes, but really should
have been done in 9.6; it's unlikely that any extant hook functions
know about using CURSOR_OPT_PARALLEL_OK.
Finally, teach gram.y to save statement boundary locations in RawStmt
nodes, and pass those through to Query and PlannedStmt nodes. This allows
more intelligent handling of cases where a source query string contains
multiple statements. This patch doesn't actually do anything with the
information, but a follow-on patch will. (Passing this information through
cleanly is the true motivation for these changes; while I think this is all
good cleanup, it's unlikely we'd have bothered without this end goal.)
catversion bump because addition of location fields to struct Query
affects stored rules.
This patch is by me, but it owes a good deal to Fabien Coelho who did
a lot of preliminary work on the problem, and also reviewed the patch.
Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
|
|
|
WRITE_NODE_FIELD(utilityStmt);
|
|
|
|
WRITE_LOCATION_FIELD(stmt_location);
|
2020-05-26 01:23:48 +02:00
|
|
|
WRITE_INT_FIELD(stmt_len);
|
2007-02-20 18:32:18 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
|
|
|
* print the basic stuff of all nodes that inherit from Plan
|
|
|
|
*/
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(StringInfo str, const Plan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_FLOAT_FIELD(startup_cost, "%.2f");
|
|
|
|
WRITE_FLOAT_FIELD(total_cost, "%.2f");
|
|
|
|
WRITE_FLOAT_FIELD(plan_rows, "%.0f");
|
|
|
|
WRITE_INT_FIELD(plan_width);
|
2015-11-11 14:57:52 +01:00
|
|
|
WRITE_BOOL_FIELD(parallel_aware);
|
2017-04-12 21:13:23 +02:00
|
|
|
WRITE_BOOL_FIELD(parallel_safe);
|
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a
non-parallel-aware Append concurrently rather than serially to improve
performance when possible. Currently, the only node type that can be
run concurrently is a ForeignScan that is an immediate child of such an
Append. In the case where such ForeignScans access data on different
remote servers, this would run those ForeignScans concurrently, and
overlap the remote operations to be performed simultaneously, so it'll
improve the performance especially when the operations involve
time-consuming ones such as remote join and remote aggregation.
We may extend this to other node types such as joins or aggregates over
ForeignScans in the future.
This also adds the support for postgres_fdw, which is enabled by the
table-level/server-level option "async_capable". The default is false.
Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit
is mostly based on the patch proposed by Robert Haas, but also uses
stuff from the patch proposed by Kyotaro Horiguchi and from the patch
proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin
Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and
others.
Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com
Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
|
|
|
WRITE_BOOL_FIELD(async_capable);
|
2015-09-29 03:55:57 +02:00
|
|
|
WRITE_INT_FIELD(plan_node_id);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(targetlist);
|
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
WRITE_NODE_FIELD(lefttree);
|
|
|
|
WRITE_NODE_FIELD(righttree);
|
2002-12-05 16:50:39 +01:00
|
|
|
WRITE_NODE_FIELD(initPlan);
|
2003-02-09 01:30:41 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(extParam);
|
|
|
|
WRITE_BITMAPSET_FIELD(allParam);
|
2002-11-25 19:12:12 +01:00
|
|
|
}
|
1998-12-15 03:24:15 +01:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
/*
|
|
|
|
* print the basic stuff of all nodes that inherit from Scan
|
|
|
|
*/
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(StringInfo str, const Scan *node)
|
2002-11-25 19:12:12 +01:00
|
|
|
{
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
1998-12-15 03:24:15 +01:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_UINT_FIELD(scanrelid);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2002-11-25 19:12:12 +01:00
|
|
|
* print the basic stuff of all nodes that inherit from Join
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2002-11-25 19:12:12 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPlanInfo(StringInfo str, const Join *node)
|
2002-11-25 19:12:12 +01:00
|
|
|
{
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2002-11-25 19:12:12 +01:00
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(jointype, JoinType);
|
2017-04-08 04:20:03 +02:00
|
|
|
WRITE_BOOL_FIELD(inner_unique);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(joinqual);
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outResult(StringInfo str, const Result *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("RESULT");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(resconstantqual);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
Move targetlist SRF handling from expression evaluation to new executor node.
Evaluation of set returning functions (SRFs_ in the targetlist (like SELECT
generate_series(1,5)) so far was done in the expression evaluation (i.e.
ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code.
This meant that most executor nodes performing projection, and most
expression evaluation functions, had to deal with the possibility that an
evaluated expression could return a set of return values.
That's bad because it leads to repeated code in a lot of places. It also,
and that's my (Andres's) motivation, made it a lot harder to implement a
more efficient way of doing expression evaluation.
To fix this, introduce a new executor node (ProjectSet) that can evaluate
targetlists containing one or more SRFs. To avoid the complexity of the old
way of handling nested expressions returning sets (e.g. having to pass up
ExprDoneCond, and dealing with arguments to functions returning sets etc.),
those SRFs can only be at the top level of the node's targetlist. The
planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is
only necessary in ProjectSet nodes and that SRFs are only present at the
top level of the node's targetlist. If there are nested SRFs the planner
creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get
input from an underlying node.
We also discussed and prototyped evaluating targetlist SRFs using ROWS
FROM(), but that turned out to be more complicated than we'd hoped.
While moving SRF evaluation to ProjectSet would allow to retain the old
"least common multiple" behavior when multiple SRFs are present in one
targetlist (i.e. continue returning rows until all SRFs are at the end of
their input at the same time), we decided to instead only return rows till
all SRFs are exhausted, returning NULL for already exhausted ones. We
deemed the previous behavior to be too confusing, unexpected and actually
not particularly useful.
As a side effect, the previously prohibited case of multiple set returning
arguments to a function, is now allowed. Not because it's particularly
desirable, but because it ends up working and there seems to be no argument
for adding code to prohibit it.
Currently the behavior for COALESCE and CASE containing SRFs has changed,
returning multiple rows from the expression, even when the SRF containing
"arm" of the expression is not evaluated. That's because the SRFs are
evaluated in a separate ProjectSet node. As that's quite confusing, we're
likely to instead prohibit SRFs in those places. But that's still being
discussed, and the code would reside in places not touched here, so that's
a task for later.
There's a lot of, now superfluous, code dealing with set return expressions
around. But as the changes to get rid of those are verbose largely boring,
it seems better for readability to keep the cleanup as a separate commit.
Author: Tom Lane and Andres Freund
Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
|
|
|
static void
|
|
|
|
_outProjectSet(StringInfo str, const ProjectSet *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PROJECTSET");
|
|
|
|
|
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
|
|
|
}
|
|
|
|
|
2009-10-10 03:43:50 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outModifyTable(StringInfo str, const ModifyTable *node)
|
2009-10-10 03:43:50 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MODIFYTABLE");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2009-10-10 03:43:50 +02:00
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(operation, CmdType);
|
2011-02-26 00:56:23 +01:00
|
|
|
WRITE_BOOL_FIELD(canSetTag);
|
2015-02-18 00:04:11 +01:00
|
|
|
WRITE_UINT_FIELD(nominalRelation);
|
2018-10-07 20:33:17 +02:00
|
|
|
WRITE_UINT_FIELD(rootRelation);
|
Allow UPDATE to move rows between partitions.
When an UPDATE causes a row to no longer match the partition
constraint, try to move it to a different partition where it does
match the partition constraint. In essence, the UPDATE is split into
a DELETE from the old partition and an INSERT into the new one. This
can lead to surprising behavior in concurrency scenarios because
EvalPlanQual rechecks won't work as they normally did; the known
problems are documented. (There is a pending patch to improve the
situation further, but it needs more review.)
Amit Khandekar, reviewed and tested by Amit Langote, David Rowley,
Rajkumar Raghuwanshi, Dilip Kumar, Amul Sul, Thomas Munro, Álvaro
Herrera, Amit Kapila, and me. A few final revisions by me.
Discussion: http://postgr.es/m/CAJ3gD9do9o2ccQ7j7+tSgiE1REY65XRiMb=yJO3u3QhyP8EEPQ@mail.gmail.com
2018-01-19 21:33:06 +01:00
|
|
|
WRITE_BOOL_FIELD(partColsUpdated);
|
2009-10-10 03:43:50 +02:00
|
|
|
WRITE_NODE_FIELD(resultRelations);
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
WRITE_NODE_FIELD(updateColnosLists);
|
2013-07-18 23:10:16 +02:00
|
|
|
WRITE_NODE_FIELD(withCheckOptionLists);
|
2009-10-10 03:43:50 +02:00
|
|
|
WRITE_NODE_FIELD(returningLists);
|
2013-03-10 19:14:53 +01:00
|
|
|
WRITE_NODE_FIELD(fdwPrivLists);
|
2016-03-18 18:48:58 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(fdwDirectModifyPlans);
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
|
|
|
WRITE_INT_FIELD(epqParam);
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
WRITE_ENUM_FIELD(onConflictAction, OnConflictAction);
|
|
|
|
WRITE_NODE_FIELD(arbiterIndexes);
|
|
|
|
WRITE_NODE_FIELD(onConflictSet);
|
Fix mishandling of resjunk columns in ON CONFLICT ... UPDATE tlists.
It's unusual to have any resjunk columns in an ON CONFLICT ... UPDATE
list, but it can happen when MULTIEXPR_SUBLINK SubPlans are present.
If it happens, the ON CONFLICT UPDATE code path would end up storing
tuples that include the values of the extra resjunk columns. That's
fairly harmless in the short run, but if new columns are added to
the table then the values would become accessible, possibly leading
to malfunctions if they don't match the datatypes of the new columns.
This had escaped notice through a confluence of missing sanity checks,
including
* There's no cross-check that a tuple presented to heap_insert or
heap_update matches the table rowtype. While it's difficult to
check that fully at reasonable cost, we can easily add assertions
that there aren't too many columns.
* The output-column-assignment cases in execExprInterp.c lacked
any sanity checks on the output column numbers, which seems like
an oversight considering there are plenty of assertion checks on
input column numbers. Add assertions there too.
* We failed to apply nodeModifyTable's ExecCheckPlanOutput() to
the ON CONFLICT UPDATE tlist. That wouldn't have caught this
specific error, since that function is chartered to ignore resjunk
columns; but it sure seems like a bad omission now that we've seen
this bug.
In HEAD, the right way to fix this is to make the processing of
ON CONFLICT UPDATE tlists work the same as regular UPDATE tlists
now do, that is don't add "SET x = x" entries, and use
ExecBuildUpdateProjection to evaluate the tlist and combine it with
old values of the not-set columns. This adds a little complication
to ExecBuildUpdateProjection, but allows removal of a comparable
amount of now-dead code from the planner.
In the back branches, the most expedient solution seems to be to
(a) use an output slot for the ON CONFLICT UPDATE projection that
actually matches the target table, and then (b) invent a variant of
ExecBuildProjectionInfo that can be told to not store values resulting
from resjunk columns, so it doesn't try to store into nonexistent
columns of the output slot. (We can't simply ignore the resjunk columns
altogether; they have to be evaluated for MULTIEXPR_SUBLINK to work.)
This works back to v10. In 9.6, projections work much differently and
we can't cheaply give them such an option. The 9.6 version of this
patch works by inserting a JunkFilter when it's necessary to get rid
of resjunk columns.
In addition, v11 and up have the reverse problem when trying to
perform ON CONFLICT UPDATE on a partitioned table. Through a
further oversight, adjust_partition_tlist() discarded resjunk columns
when re-ordering the ON CONFLICT UPDATE tlist to match a partition.
This accidentally prevented the storing-bogus-tuples problem, but
at the cost that MULTIEXPR_SUBLINK cases didn't work, typically
crashing if more than one row has to be updated. Fix by preserving
resjunk columns in that routine. (I failed to resist the temptation
to add more assertions there too, and to do some minor code
beautification.)
Per report from Andres Freund. Back-patch to all supported branches.
Security: CVE-2021-32028
2021-05-10 17:02:29 +02:00
|
|
|
WRITE_NODE_FIELD(onConflictCols);
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
WRITE_NODE_FIELD(onConflictWhere);
|
2015-08-06 02:44:27 +02:00
|
|
|
WRITE_UINT_FIELD(exclRelRTI);
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
|
|
|
WRITE_NODE_FIELD(exclRelTlist);
|
2022-03-28 16:45:58 +02:00
|
|
|
WRITE_NODE_FIELD(mergeActionLists);
|
2009-10-10 03:43:50 +02:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outAppend(StringInfo str, const Append *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("APPEND");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Further adjust EXPLAIN's choices of table alias names.
This patch causes EXPLAIN to always assign a separate table alias to the
parent RTE of an append relation (inheritance set); before, such RTEs
were ignored if not actually scanned by the plan. Since the child RTEs
now always have that same alias to start with (cf. commit 55a1954da),
the net effect is that the parent RTE usually gets the alias used or
implied by the query text, and the children all get that alias with "_N"
appended. (The exception to "usually" is if there are duplicate aliases
in different subtrees of the original query; then some of those original
RTEs will also have "_N" appended.)
This results in more uniform output for partitioned-table plans than
we had before: the partitioned table itself gets the original alias,
and all child tables have aliases with "_N", rather than the previous
behavior where one of the children would get an alias without "_N".
The reason for giving the parent RTE an alias, even if it isn't scanned
by the plan, is that we now use the parent's alias to qualify Vars that
refer to an appendrel output column and appear above the Append or
MergeAppend that computes the appendrel. But below the append, Vars
refer to some one of the child relations, and are displayed that way.
This seems clearer than the old behavior where a Var that could carry
values from any child relation was displayed as if it referred to only
one of them.
While at it, change ruleutils.c so that the code paths used by EXPLAIN
deal in Plan trees not PlanState trees. This effectively reverts a
decision made in commit 1cc29fe7c, which seemed like a good idea at
the time to make ruleutils.c consistent with explain.c. However,
it's problematic because we'd really like to allow executor startup
pruning to remove all the children of an append node when possible,
leaving no child PlanState to resolve Vars against. (That's not done
here, but will be in the next patch.) This requires different handling
of subplans and initplans than before, but is otherwise a pretty
straightforward change.
Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(apprelids);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(appendplans);
|
Add support for asynchronous execution.
This implements asynchronous execution, which runs multiple parts of a
non-parallel-aware Append concurrently rather than serially to improve
performance when possible. Currently, the only node type that can be
run concurrently is a ForeignScan that is an immediate child of such an
Append. In the case where such ForeignScans access data on different
remote servers, this would run those ForeignScans concurrently and overlap
the remote operations so that they are performed simultaneously, which
improves performance especially when the operations are time-consuming
ones such as remote joins and remote aggregation.
We may extend this to other node types such as joins or aggregates over
ForeignScans in the future.
This also adds the support for postgres_fdw, which is enabled by the
table-level/server-level option "async_capable". The default is false.
Robert Haas, Kyotaro Horiguchi, Thomas Munro, and myself. This commit
is mostly based on the patch proposed by Robert Haas, but also uses
stuff from the patch proposed by Kyotaro Horiguchi and from the patch
proposed by Thomas Munro. Reviewed by Kyotaro Horiguchi, Konstantin
Knizhnik, Andrey Lepikhov, Movead Li, Thomas Munro, Justin Pryzby, and
others.
Discussion: https://postgr.es/m/CA%2BTgmoaXQEt4tZ03FtQhnzeDEMzBck%2BLrni0UWHVVgOTnA6C1w%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGLBRyu0rHrDCMC4%3DRn3252gogyp1SjOgG8SEKKZv%3DFwfQ%40mail.gmail.com
Discussion: https://postgr.es/m/20200228.170650.667613673625155850.horikyota.ntt%40gmail.com
2021-03-31 11:45:00 +02:00
|
|
|
WRITE_INT_FIELD(nasyncplans);
|
Support Parallel Append plan nodes.
When we create an Append node, we can spread out the workers over the
subplans instead of piling on to each subplan one at a time, which
should typically be a bit more efficient, both because the startup
cost of any plan executed entirely by one worker is paid only once and
also because of reduced contention. We can also construct Append
plans using a mix of partial and non-partial subplans, which may allow
for parallelism in places that otherwise couldn't support it.
Unfortunately, this patch doesn't handle the important case of
parallelizing UNION ALL by running each branch in a separate worker;
the executor infrastructure is added here, but more planner work is
needed.
Amit Khandekar, Robert Haas, Amul Sul, reviewed and tested by
Ashutosh Bapat, Amit Langote, Rafia Sabih, Amit Kapila, and
Rajkumar Raghuwanshi.
Discussion: http://postgr.es/m/CAJ3gD9dy0K_E8r727heqXoBmWZ83HwLFwdcaSSmBQ1+S+vRuUQ@mail.gmail.com
2017-12-05 23:28:39 +01:00
|
|
|
WRITE_INT_FIELD(first_partial_plan);
|
2018-08-02 01:42:46 +02:00
|
|
|
WRITE_NODE_FIELD(part_prune_info);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
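/*
 * A hand-written sketch (abbreviated, not captured from a real server) of
 * the text _outAppend emits, following this file's conventions: scalar
 * fields as ":name value", bitmapsets as "(b ...)", node lists in
 * parentheses, and NULL node fields as "<>":
 *
 *	{APPEND :startup_cost 0.00 ... :apprelids (b 1) :appendplans (...)
 *	 :nasyncplans 0 :first_partial_plan 0 :part_prune_info <>}
 */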
|
|
|
|
|
2010-10-14 22:56:39 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outMergeAppend(StringInfo str, const MergeAppend *node)
|
2010-10-14 22:56:39 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MERGEAPPEND");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2010-10-14 22:56:39 +02:00
|
|
|
|
2019-12-11 23:05:18 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(apprelids);
|
2010-10-14 22:56:39 +02:00
|
|
|
WRITE_NODE_FIELD(mergeplans);
|
|
|
|
WRITE_INT_FIELD(numCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(sortOperators, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(collations, node->numCols);
|
|
|
|
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
|
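/*
 * Note the convention used here and elsewhere in this file: numCols is
 * written first, and each following WRITE_*_ARRAY call uses it as the
 * length of a parallel per-sort-column array (attribute numbers, sort
 * operators, collations, nulls-first flags).
 */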
2018-08-02 01:42:46 +02:00
|
|
|
WRITE_NODE_FIELD(part_prune_info);
|
2010-10-14 22:56:39 +02:00
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRecursiveUnion(StringInfo str, const RecursiveUnion *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RECURSIVEUNION");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2008-10-04 23:56:55 +02:00
|
|
|
|
|
|
|
WRITE_INT_FIELD(wtParam);
|
2008-10-07 21:27:04 +02:00
|
|
|
WRITE_INT_FIELD(numCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(dupColIdx, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(dupOperators, node->numCols);
|
2019-03-22 12:09:32 +01:00
|
|
|
WRITE_OID_ARRAY(dupCollations, node->numCols);
|
2008-10-07 21:27:04 +02:00
|
|
|
WRITE_LONG_FIELD(numGroups);
|
2008-10-04 23:56:55 +02:00
|
|
|
}
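/*
 * Context, with a hand-written example (mine, not from the sources): a
 * RecursiveUnion is the plan node behind WITH RECURSIVE, e.g.
 *
 *	WITH RECURSIVE r(n) AS (
 *		SELECT 1
 *		UNION ALL
 *		SELECT n + 1 FROM r WHERE n < 10
 *	)
 *	SELECT * FROM r;
 *
 * wtParam identifies the working-table Param shared with the
 * WorkTableScan node that reads back the previous iteration's rows.
 */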
|
|
|
|
|
2005-04-20 00:35:18 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outBitmapAnd(StringInfo str, const BitmapAnd *node)
|
2005-04-20 00:35:18 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("BITMAPAND");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2005-04-20 00:35:18 +02:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(bitmapplans);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outBitmapOr(StringInfo str, const BitmapOr *node)
|
2005-04-20 00:35:18 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("BITMAPOR");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2005-04-20 00:35:18 +02:00
|
|
|
|
Support parallel bitmap heap scans.
The index is scanned by a single process, but then all cooperating
processes can iterate jointly over the resulting set of heap blocks.
In the future, we might also want to support using a parallel bitmap
index scan to set up for a parallel bitmap heap scan, but that's a
job for another day.
Dilip Kumar, with some corrections and cosmetic changes by me. The
larger patch set of which this is a part has been reviewed and tested
by (at least) Andres Freund, Amit Khandekar, Tushar Ahuja, Rafia
Sabih, Haribabu Kommi, Thomas Munro, and me.
Discussion: http://postgr.es/m/CAFiTN-uc4=0WxRGfCzs-xfkMYcSEWUC-Fon6thkJGjkh9i=13A@mail.gmail.com
2017-03-08 18:05:43 +01:00
|
|
|
WRITE_BOOL_FIELD(isshared);
|
2005-04-20 00:35:18 +02:00
|
|
|
WRITE_NODE_FIELD(bitmapplans);
|
|
|
|
}
|
|
|
|
|
Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some design suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00
|
|
|
static void
|
|
|
|
_outGather(StringInfo str, const Gather *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GATHER");
|
|
|
|
|
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
|
|
|
|
2015-10-01 15:15:36 +02:00
|
|
|
WRITE_INT_FIELD(num_workers);
|
Force rescanning of parallel-aware scan nodes below a Gather[Merge].
The ExecReScan machinery contains various optimizations for postponing
or skipping rescans of plan subtrees; for example a HashAgg node may
conclude that it can re-use the table it built before, instead of
re-reading its input subtree. But that is wrong if the input contains
a parallel-aware table scan node, since the portion of the table scanned
by the leader process is likely to vary from one rescan to the next.
This explains the timing-dependent buildfarm failures we saw after
commit a2b70c89c.
The established mechanism for showing that a plan node's output is
potentially variable is to mark it as depending on some runtime Param.
Hence, to fix this, invent a dummy Param (one that has a PARAM_EXEC
parameter number, but carries no actual value) associated with each Gather
or GatherMerge node, mark parallel-aware nodes below that node as dependent
on that Param, and arrange for ExecReScanGather[Merge] to flag that Param
as changed whenever the Gather[Merge] node is rescanned.
This solution breaks an undocumented assumption made by the parallel
executor logic, namely that all rescans of nodes below a Gather[Merge]
will happen synchronously during the ReScan of the top node itself.
But that's fundamentally contrary to the design of the ExecReScan code,
and so was doomed to fail someday anyway (even if you want to argue
that the bug being fixed here wasn't a failure of that assumption).
A follow-on patch will address that issue. In the meantime, the worst
that's expected to happen is that given very bad timing luck, the leader
might have to do all the work during a rescan, because workers think
they have nothing to do, if they are able to start up before the eventual
ReScan of the leader's parallel-aware table scan node has reset the
shared scan state.
Although this problem exists in 9.6, there does not seem to be any way
for it to manifest there. Without GatherMerge, it seems that a plan tree
that has a rescan-short-circuiting node below Gather will always also
have one above it that will short-circuit in the same cases, preventing
the Gather from being rescanned. Hence we won't take the risk of
back-patching this change into 9.6. But v10 needs it.
Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com
2017-08-30 15:29:55 +02:00
|
|
|
WRITE_INT_FIELD(rescan_param);
|
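/*
 * Hedged sketch (paraphrased from memory, not verbatim executor code) of
 * how the dummy parameter described in the commit message above is used:
 * on rescan, the Gather node advertises rescan_param as changed so that
 * parallel-aware children below it cannot skip their own rescan:
 *
 *	if (node->rescan_param >= 0)
 *		outerPlan->chgParam = bms_add_member(outerPlan->chgParam,
 *											 node->rescan_param);
 */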
2015-10-01 15:15:36 +02:00
|
|
|
WRITE_BOOL_FIELD(single_copy);
|
2016-02-07 17:39:22 +01:00
|
|
|
WRITE_BOOL_FIELD(invisible);
|
2017-11-16 18:06:14 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(initParam);
|
2015-10-01 01:23:36 +02:00
|
|
|
}
|
|
|
|
|
2017-03-09 13:40:36 +01:00
|
|
|
static void
|
|
|
|
_outGatherMerge(StringInfo str, const GatherMerge *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GATHERMERGE");
|
|
|
|
|
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
|
|
|
|
|
|
|
WRITE_INT_FIELD(num_workers);
|
2017-08-30 15:29:55 +02:00
|
|
|
WRITE_INT_FIELD(rescan_param);
|
2017-03-09 13:40:36 +01:00
|
|
|
WRITE_INT_FIELD(numCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(sortOperators, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(collations, node->numCols);
|
|
|
|
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
|
2017-11-16 18:06:14 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(initParam);
|
2017-03-09 13:40:36 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScan(StringInfo str, const Scan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("SCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, node);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSeqScan(StringInfo str, const SeqScan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("SEQSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM. (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.) Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type. This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which would be undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
Back-patch to 9.5 so that we don't have to support the original API
in production.
2015-07-25 20:39:00 +02:00
|
|
|
static void
|
|
|
|
_outSampleScan(StringInfo str, const SampleScan *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SAMPLESCAN");
|
|
|
|
|
|
|
|
_outScanInfo(str, (const Scan *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(tablesample);
|
|
|
|
}
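/*
 * Hand-written example (mine, not from the sources): the tablesample
 * field serializes the TABLESAMPLE clause of a query such as
 *
 *	SELECT * FROM t TABLESAMPLE BERNOULLI (10) REPEATABLE (42);
 */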
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outIndexScan(StringInfo str, const IndexScan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("INDEXSCAN");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2002-11-25 19:12:12 +01:00
|
|
|
|
2005-04-25 03:30:14 +02:00
|
|
|
WRITE_OID_FIELD(indexid);
|
|
|
|
WRITE_NODE_FIELD(indexqual);
|
|
|
|
WRITE_NODE_FIELD(indexqualorig);
|
2010-12-03 02:50:48 +01:00
|
|
|
WRITE_NODE_FIELD(indexorderby);
|
|
|
|
WRITE_NODE_FIELD(indexorderbyorig);
|
2015-05-18 03:22:12 +02:00
|
|
|
WRITE_NODE_FIELD(indexorderbyops);
|
2005-04-25 03:30:14 +02:00
|
|
|
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
|
2011-10-11 20:20:06 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
|
2011-10-11 20:20:06 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("INDEXONLYSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2011-10-11 20:20:06 +02:00
|
|
|
|
|
|
|
WRITE_OID_FIELD(indexid);
|
|
|
|
WRITE_NODE_FIELD(indexqual);
|
Fix index-only scan plans, take 2.
Commit 4ace45677 failed to fix the problem fully, because the
same issue of attempting to fetch a non-returnable index column
can occur when rechecking the indexqual after using a lossy index
operator. Moreover, it broke EXPLAIN for such indexquals (which
indicates a gap in our test cases :-().
Revert the code changes of 4ace45677 in favor of adding a new field
to struct IndexOnlyScan, containing a version of the indexqual that
can be executed against the index-returned tuple without using any
non-returnable columns. (The restrictions imposed by check_index_only
guarantee this is possible, although we may have to recompute indexed
expressions.) Support construction of that during setrefs.c
processing by marking IndexOnlyScan.indextlist entries as resjunk
if they can't be returned, rather than removing them entirely.
(We could alternatively require setrefs.c to look up the IndexOptInfo
again, but abusing resjunk this way seems like a reasonably safe way
to avoid needing to do that.)
This solution isn't great from an API-stability standpoint: if there
are any extensions out there that build IndexOnlyScan structs directly,
they'll be broken in the next minor releases. However, only a very
invasive extension would be likely to do such a thing. There's no
change in the Path representation, so typical planner extensions
shouldn't have a problem.
As before, back-patch to all supported branches.
Discussion: https://postgr.es/m/3179992.1641150853@sss.pgh.pa.us
Discussion: https://postgr.es/m/17350-b5bdcf476e5badbb@postgresql.org
2022-01-03 21:42:27 +01:00
|
|
|
WRITE_NODE_FIELD(recheckqual);
|
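/*
 * Per the commit message above: recheckqual is a variant of indexqual
 * that can be evaluated against the index-returned tuple when a lossy
 * operator forces a recheck, referencing only returnable index columns.
 */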
2011-10-11 20:20:06 +02:00
|
|
|
WRITE_NODE_FIELD(indexorderby);
|
|
|
|
WRITE_NODE_FIELD(indextlist);
|
|
|
|
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2005-04-20 00:35:18 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node)
|
2005-04-20 00:35:18 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("BITMAPINDEXSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2005-04-20 00:35:18 +02:00
|
|
|
|
2005-04-25 03:30:14 +02:00
|
|
|
WRITE_OID_FIELD(indexid);
|
2017-03-08 18:05:43 +01:00
|
|
|
WRITE_BOOL_FIELD(isshared);
|
2005-04-25 03:30:14 +02:00
|
|
|
WRITE_NODE_FIELD(indexqual);
|
|
|
|
WRITE_NODE_FIELD(indexqualorig);
|
2005-04-20 00:35:18 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outBitmapHeapScan(StringInfo str, const BitmapHeapScan *node)
|
2005-04-20 00:35:18 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("BITMAPHEAPSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2005-04-20 00:35:18 +02:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(bitmapqualorig);
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outTidScan(StringInfo str, const TidScan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("TIDSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2005-11-26 23:14:57 +01:00
|
|
|
WRITE_NODE_FIELD(tidquals);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2021-02-27 10:59:36 +01:00
|
|
|
static void
|
|
|
|
_outTidRangeScan(StringInfo str, const TidRangeScan *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("TIDRANGESCAN");
|
|
|
|
|
|
|
|
_outScanInfo(str, (const Scan *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(tidrangequals);
|
|
|
|
}
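/*
 * Hand-written example (mine, not from the sources): tidrangequals holds
 * ctid restrictions such as those in
 *
 *	SELECT * FROM t WHERE ctid >= '(0,0)' AND ctid < '(10,0)';
 */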
|
|
|
|
|
1998-02-13 04:27:47 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSubqueryScan(StringInfo str, const SubqueryScan *node)
|
1998-02-13 04:27:47 +01:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("SUBQUERYSCAN");
|
1998-12-15 03:24:15 +01:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
1998-12-15 03:24:15 +01:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(subplan);
|
2022-04-06 08:45:00 +02:00
|
|
|
WRITE_ENUM_FIELD(scanstatus, SubqueryScanStatus);
|
1998-02-13 04:27:47 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outFunctionScan(StringInfo str, const FunctionScan *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("FUNCTIONSCAN");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2007-02-19 03:23:12 +01:00
|
|
|
|
Support multi-argument UNNEST(), and TABLE() syntax for multiple functions.
This patch adds the ability to write TABLE( function1(), function2(), ...)
as a single FROM-clause entry. The result is the concatenation of the
first row from each function, followed by the second row from each
function, etc; with NULLs inserted if any function produces fewer rows than
others. This is believed to be a much more useful behavior than what
Postgres currently does with multiple SRFs in a SELECT list.
This syntax also provides a reasonable way to combine use of column
definition lists with WITH ORDINALITY: put the column definition list
inside TABLE(), where it's clear that it doesn't control the ordinality
column as well.
Also implement SQL-compliant multiple-argument UNNEST(), by turning
UNNEST(a,b,c) into TABLE(unnest(a), unnest(b), unnest(c)).
The SQL standard specifies TABLE() with only a single function, not
multiple functions, and it seems to require an implicit UNNEST() which is
not what this patch does. There may be something wrong with that reading
of the spec, though, because if it's right then the spec's TABLE() is just
a pointless alternative spelling of UNNEST(). After further review of
that, we might choose to adopt a different syntax for what this patch does,
but in any case this functionality seems clearly worthwhile.
Andrew Gierth, reviewed by Zoltán Böszörményi and Heikki Linnakangas, and
significantly revised by me
2013-11-22 01:37:02 +01:00
|
|
|
WRITE_NODE_FIELD(functions);
|
2013-07-29 17:38:01 +02:00
|
|
|
WRITE_BOOL_FIELD(funcordinality);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
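/*
 * Hand-written example (mine, not from the sources): the functions and
 * funcordinality fields cover FROM-clause entries such as
 *
 *	SELECT * FROM unnest(ARRAY[1,2], ARRAY['a','b','c'])
 *		   WITH ORDINALITY AS t(x, y, ord);
 *
 * where, as the commit message above describes, the shorter function
 * result is padded with NULLs.
 */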
|
|
|
|
|
2017-03-08 16:39:37 +01:00
|
|
|
static void
|
|
|
|
_outTableFuncScan(StringInfo str, const TableFuncScan *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("TABLEFUNCSCAN");
|
|
|
|
|
|
|
|
_outScanInfo(str, (const Scan *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(tablefunc);
|
|
|
|
}
|
|
|
|
|
2006-08-02 03:59:48 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outValuesScan(StringInfo str, const ValuesScan *node)
|
2006-08-02 03:59:48 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("VALUESSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2007-02-19 03:23:12 +01:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(values_lists);
|
2006-08-02 03:59:48 +02:00
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outCteScan(StringInfo str, const CteScan *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("CTESCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2008-10-04 23:56:55 +02:00
|
|
|
|
|
|
|
WRITE_INT_FIELD(ctePlanId);
|
|
|
|
WRITE_INT_FIELD(cteParam);
|
|
|
|
}
|
|
|
|
|
2017-04-01 06:17:18 +02:00
|
|
|
static void
|
|
|
|
_outNamedTuplestoreScan(StringInfo str, const NamedTuplestoreScan *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("NAMEDTUPLESTORESCAN");
|
|
|
|
|
|
|
|
_outScanInfo(str, (const Scan *) node);
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(enrname);
|
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outWorkTableScan(StringInfo str, const WorkTableScan *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WORKTABLESCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2008-10-04 23:56:55 +02:00
|
|
|
|
|
|
|
WRITE_INT_FIELD(wtParam);
|
|
|
|
}
|
|
|
|
|
2011-02-20 06:17:18 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outForeignScan(StringInfo str, const ForeignScan *node)
|
2011-02-20 06:17:18 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("FOREIGNSCAN");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2011-02-20 06:17:18 +01:00
|
|
|
|
2016-03-18 18:48:58 +01:00
|
|
|
WRITE_ENUM_FIELD(operation, CmdType);
|
2021-06-06 09:08:21 +02:00
|
|
|
WRITE_UINT_FIELD(resultRelation);
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
WRITE_OID_FIELD(fs_server);
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
|
|
|
WRITE_NODE_FIELD(fdw_exprs);
|
2011-02-20 06:17:18 +01:00
|
|
|
WRITE_NODE_FIELD(fdw_private);
|
2015-05-10 20:36:30 +02:00
|
|
|
WRITE_NODE_FIELD(fdw_scan_tlist);
|
2015-10-15 19:00:40 +02:00
|
|
|
WRITE_NODE_FIELD(fdw_recheck_quals);
|
2015-05-10 20:36:30 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(fs_relids);
|
2012-03-09 18:48:48 +01:00
|
|
|
WRITE_BOOL_FIELD(fsSystemCol);
|
2011-02-20 06:17:18 +01:00
|
|
|
}
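/*
 * Hedged sketch of how an FDW's GetForeignPlan callback populates the
 * fields above (argument order per the commits quoted above; details may
 * vary across versions):
 *
 *	return make_foreignscan(tlist, scan_clauses, scan_relid,
 *							fdw_exprs,		(expressions to post-process)
 *							fdw_private,	(FDW-private, copyObject-safe)
 *							fdw_scan_tlist, (optional custom scan tlist)
 *							fdw_recheck_quals,
 *							outer_plan);
 */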
|
|
|
|
|
2014-11-07 23:26:02 +01:00
|
|
|
static void
|
|
|
|
_outCustomScan(StringInfo str, const CustomScan *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("CUSTOMSCAN");
|
|
|
|
|
|
|
|
_outScanInfo(str, (const Scan *) node);
|
2014-11-22 00:21:46 +01:00
|
|
|
|
2014-11-07 23:26:02 +01:00
|
|
|
WRITE_UINT_FIELD(flags);
|
2015-08-06 02:44:27 +02:00
|
|
|
WRITE_NODE_FIELD(custom_plans);
|
2014-11-22 00:21:46 +01:00
|
|
|
WRITE_NODE_FIELD(custom_exprs);
|
|
|
|
WRITE_NODE_FIELD(custom_private);
|
2015-05-10 20:36:30 +02:00
|
|
|
WRITE_NODE_FIELD(custom_scan_tlist);
|
2015-05-01 14:50:35 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(custom_relids);
|
2016-03-29 17:00:18 +02:00
|
|
|
/* CustomName is a key to lookup CustomScanMethods */
|
2014-11-22 00:21:46 +01:00
|
|
|
appendStringInfoString(str, " :methods ");
|
2016-09-16 15:36:19 +02:00
|
|
|
outToken(str, node->methods->CustomName);
|
2014-11-07 23:26:02 +01:00
|
|
|
}
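/*
 * Hedged sketch (assuming the extensible-node API declared in
 * nodes/extensible.h): on the read side, the CustomName token emitted
 * above is used to look the methods table up again, roughly:
 *
 *	const CustomScanMethods *methods =
 *		GetCustomScanMethods(custom_name, false);
 */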
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outNestLoop(StringInfo str, const NestLoop *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("NESTLOOP");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPlanInfo(str, (const Join *) node);
|
2010-07-12 19:01:06 +02:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(nestParams);
|
2002-11-25 19:12:12 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outMergeJoin(StringInfo str, const MergeJoin *node)
|
2002-11-25 19:12:12 +01:00
|
|
|
{
|
2007-01-10 19:06:05 +01:00
|
|
|
int numCols;
|
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("MERGEJOIN");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPlanInfo(str, (const Join *) node);
|
1998-11-22 11:48:45 +01:00
|
|
|
|
2017-04-08 04:20:03 +02:00
|
|
|
WRITE_BOOL_FIELD(skip_mark_restore);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(mergeclauses);
|
2007-01-10 19:06:05 +01:00
|
|
|
|
|
|
|
numCols = list_length(node->mergeclauses);
|
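/* The arrays below are parallel to mergeclauses, one entry per clause. */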
|
|
|
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_OID_ARRAY(mergeFamilies, numCols);
|
|
|
|
WRITE_OID_ARRAY(mergeCollations, numCols);
|
|
|
|
WRITE_INT_ARRAY(mergeStrategies, numCols);
|
|
|
|
WRITE_BOOL_ARRAY(mergeNullsFirst, numCols);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
1999-11-23 21:07:06 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outHashJoin(StringInfo str, const HashJoin *node)
|
1999-11-23 21:07:06 +01:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("HASHJOIN");
|
1999-11-23 21:07:06 +01:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPlanInfo(str, (const Join *) node);
|
1999-11-23 21:07:06 +01:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(hashclauses);
|
Fix representation of hash keys in Hash/HashJoin nodes.
In 5f32b29c1819 I changed the creation of HashState.hashkeys to
actually use HashState as the parent (instead of HashJoinState, which
was incorrect, as they were executed below HashState), to fix the
problem of hashkeys expressions otherwise relying on slot types
appropriate for HashJoinState, rather than HashState as would be
correct. That reliance was only introduced in 12, which is why it
previously worked to use HashJoinState as the parent (although I'd be
unsurprised if there were problematic cases).
Unfortunately that's not a sufficient solution, because before this
commit, the to-be-hashed expressions referenced inner/outer as
appropriate for the HashJoin, not Hash. That didn't have obvious bad
consequences, because the slots containing the tuples were put into
ecxt_innertuple when hashing a tuple for HashState (even though Hash
doesn't have an inner plan).
There are less common cases where this can cause visible problems
however (rather than just confusion when inspecting such executor
trees). E.g. "ERROR: bogus varno: 65000", when explaining queries
containing a HashJoin where the subsidiary Hash node's hash keys
reference a subplan. While normally hashkeys aren't displayed by
EXPLAIN, if one of those expressions references a subplan, that
subplan may be printed as part of the Hash node - which then failed
because an inner plan was referenced, and Hash doesn't have that.
It seems quite possible that there's other broken cases, too.
Fix the problem by properly splitting the expression for the HashJoin
and Hash nodes at plan time, and have them reference the proper
subsidiary node. While other workarounds are possible, fixing this
correctly seems easy enough. It was a pretty ugly hack to have
ExecInitHashJoin put the expression into the already initialized
HashState, in the first place.
I decided to not just split inner/outer hashkeys inside
make_hashjoin(), but also to separate out hashoperators and
hashcollations at plan time. Otherwise we would have ended up having
two very similar loops, one at plan time and the other during executor
startup. The work seems to more appropriately belong to plan time,
anyway.
Reported-By: Nikita Glukhov, Alexander Korotkov
Author: Andres Freund
Reviewed-By: Tom Lane, in an earlier version
Discussion: https://postgr.es/m/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR+teQ_8tEXU8mxg@mail.gmail.com
Backpatch: 12-
2019-08-02 09:02:46 +02:00
|
|
|
WRITE_NODE_FIELD(hashoperators);
|
|
|
|
WRITE_NODE_FIELD(hashcollations);
|
|
|
|
WRITE_NODE_FIELD(hashkeys);
|
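/*
 * Per the commit message above: hashoperators, hashcollations, and the
 * outer-side hashkeys live on the HashJoin, while the subsidiary Hash
 * node carries its own hashkeys for the inner side.
 */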
1999-11-23 21:07:06 +01:00
|
|
|
}
|
|
|
|
|
2000-09-29 20:21:41 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outAgg(StringInfo str, const Agg *node)
|
2000-09-29 20:21:41 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("AGG");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2000-09-29 20:21:41 +02:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
|
2016-06-26 20:33:38 +02:00
|
|
|
WRITE_ENUM_FIELD(aggsplit, AggSplit);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_INT_FIELD(numCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numCols);
|
|
|
|
WRITE_OID_ARRAY(grpOperators, node->numCols);
|
2019-03-22 12:09:32 +01:00
|
|
|
WRITE_OID_ARRAY(grpCollations, node->numCols);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_LONG_FIELD(numGroups);
|
2020-02-28 18:32:35 +01:00
|
|
|
WRITE_UINT64_FIELD(transitionSpace);
|
2016-08-24 20:37:50 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(aggParams);
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows aggregating data by different
GROUP BY clauses at once. Each grouping set returns rows in which the
columns grouped by only in other sets are set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains), and easily allows
skipping the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
|
|
|
WRITE_NODE_FIELD(groupingSets);
|
|
|
|
WRITE_NODE_FIELD(chain);
|
2000-09-29 20:21:41 +02:00
|
|
|
}
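/*
 * Hand-written example (mine, not from the sources): groupingSets and
 * chain carry the extra structure needed for queries such as
 *
 *	SELECT brand, size, sum(sales)
 *	FROM items_sold
 *	GROUP BY GROUPING SETS ((brand), (size), ());
 *
 * where each row returns NULL for the columns its grouping set does not
 * group by.
 */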
|
|
|
|
|
2008-12-28 19:54:01 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outWindowAgg(StringInfo str, const WindowAgg *node)
|
2008-12-28 19:54:01 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WINDOWAGG");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlanInfo(str, (const Plan *) node);
|
2008-12-28 19:54:01 +01:00
|
|
|
|
2008-12-31 01:08:39 +01:00
|
|
|
WRITE_UINT_FIELD(winref);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_INT_FIELD(partNumCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(partColIdx, node->partNumCols);
|
|
|
|
WRITE_OID_ARRAY(partOperators, node->partNumCols);
|
2019-03-22 12:09:32 +01:00
|
|
|
WRITE_OID_ARRAY(partCollations, node->partNumCols);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_INT_FIELD(ordNumCols);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(ordColIdx, node->ordNumCols);
|
|
|
|
WRITE_OID_ARRAY(ordOperators, node->ordNumCols);
|
2019-03-22 12:09:32 +01:00
|
|
|
WRITE_OID_ARRAY(ordCollations, node->ordNumCols);
|
2008-12-31 01:08:39 +01:00
|
|
|
WRITE_INT_FIELD(frameOptions);
|
2010-02-12 18:33:21 +01:00
|
|
|
WRITE_NODE_FIELD(startOffset);
|
|
|
|
WRITE_NODE_FIELD(endOffset);
|
Teach planner and executor about monotonic window funcs
Window functions such as row_number() always return a value higher than
the previously returned value for tuples in any given window partition.
Traditionally, queries such as
SELECT * FROM (
SELECT *, row_number() over (order by c) rn
FROM t
) t WHERE rn <= 10;
were executed fairly inefficiently. Neither the query planner nor the
executor knew that once rn made it to 11, nothing further would match
the outer query's WHERE clause. It would blindly continue until all
tuples were exhausted from the subquery.
Here we implement means to make the above execute more efficiently.
This is done by way of adding a pg_proc.prosupport function to several of
the built-in window functions and adding supporting code to allow the
support function to inform the planner if the window function is
monotonically increasing, monotonically decreasing, both or neither. The
planner is then able to make use of that information and possibly allow
the executor to short-circuit execution by way of adding a "run condition"
to the WindowAgg to allow it to determine if some of its execution work
can be skipped.
This "run condition" is not like a normal filter. These run conditions
are only built using quals comparing values to monotonic window functions.
For monotonically increasing functions, quals making use of the btree
operators for <, <= and = can be used (assuming the window function column
is on the left). You can see here that once such a condition becomes false,
a monotonically increasing function can never subsequently make it true
again. For monotonically decreasing functions, the >, >= and = btree
operators for the given type can be used for run conditions.
The best-case situation for this is when there is a single WindowAgg node
without a PARTITION BY clause. Here when the run condition becomes false
the WindowAgg node can simply return NULL. No more tuples will ever match
the run condition. It's a little more complex when there is a PARTITION
BY clause. In this case, we cannot return NULL as we must still process
other partitions. To speed this case up we pull tuples from the outer
plan to check if they're from the same partition and simply discard them
if they are. When we find a tuple belonging to another partition we start
processing as normal again until the run condition becomes false or we run
out of tuples to process.
When there are multiple WindowAgg nodes to evaluate then this complicates
the situation. For intermediate WindowAggs we must ensure we always
return all tuples to the calling node. Any filtering done could lead to
incorrect results in WindowAgg nodes above. For all intermediate nodes,
we can still save some work when the run condition becomes false. We've
no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot
reference the value of these and these tuples will not appear in the final
result anyway. The savings here are small in comparison to what can be
saved in the top-level WingowAgg, but still worthwhile.
Intermediate WindowAgg nodes never filter out tuples, but here we change
WindowAgg so that the top-level WindowAgg filters out tuples that don't
match the intermediate WindowAgg node's run condition. Such filters
appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node.
Here we add prosupport functions to allow the above to work for;
row_number(), rank(), dense_rank(), count(*) and count(expr). It appears
technically possible to do the same for min() and max(), however, it seems
unlikely to be useful enough, so that's not done here.
Bump catversion
Author: David Rowley
Reviewed-by: Andy Fan, Zhihong Yu
Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
|
|
|
WRITE_NODE_FIELD(runCondition);
WRITE_NODE_FIELD(runConditionOrig);
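
/*
 * Fields supporting RANGE offset PRECEDING/FOLLOWING frame bounds: the
 * btree opclass "in_range" support functions that add or subtract the
 * offset, plus the collation and sort direction of the ordering column
 * they apply to.
 */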
WRITE_OID_FIELD(startInRangeFunc);
WRITE_OID_FIELD(endInRangeFunc);
WRITE_OID_FIELD(inRangeColl);
WRITE_BOOL_FIELD(inRangeAsc);
WRITE_BOOL_FIELD(inRangeNullsFirst);
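
/*
 * topWindow is true only for the top-most WindowAgg in a chain; that
 * node alone filters out tuples failing an intermediate node's run
 * condition.
 */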
WRITE_BOOL_FIELD(topWindow);
}

static void
_outGroup(StringInfo str, const Group *node)
{
WRITE_NODE_TYPE("GROUP");

_outPlanInfo(str, (const Plan *) node);

WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(grpColIdx, node->numCols);
WRITE_OID_ARRAY(grpOperators, node->numCols);
WRITE_OID_ARRAY(grpCollations, node->numCols);
}

static void
_outMaterial(StringInfo str, const Material *node)
{
WRITE_NODE_TYPE("MATERIAL");
_outPlanInfo(str, (const Plan *) node);
}

static void
_outMemoize(StringInfo str, const Memoize *node)
{
WRITE_NODE_TYPE("MEMOIZE");
_outPlanInfo(str, (const Plan *) node);

WRITE_INT_FIELD(numKeys);
WRITE_OID_ARRAY(hashOperators, node->numKeys);
WRITE_OID_ARRAY(collations, node->numKeys);
WRITE_NODE_FIELD(param_exprs);
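
/*
 * singlerow indicates that each cache entry is complete after storing a
 * single inner tuple; binary_mode requests byte-wise rather than
 * equality-operator comparison of cache keys.
 */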
WRITE_BOOL_FIELD(singlerow);
WRITE_BOOL_FIELD(binary_mode);
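
/*
 * est_entries is the planner's estimate of how many entries fit in the
 * cache; keyparamids are the PARAM_EXEC IDs appearing in param_exprs.
 */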
WRITE_UINT_FIELD(est_entries);
WRITE_BITMAPSET_FIELD(keyparamids);
}
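
/*
 * _outSortInfo prints the fields common to Sort and its derived node
 * IncrementalSort; both output functions below call it.
 */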
static void
_outSortInfo(StringInfo str, const Sort *node)
{
_outPlanInfo(str, (const Plan *) node);

WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(sortColIdx, node->numCols);
WRITE_OID_ARRAY(sortOperators, node->numCols);
WRITE_OID_ARRAY(collations, node->numCols);
WRITE_BOOL_ARRAY(nullsFirst, node->numCols);
}

static void
_outSort(StringInfo str, const Sort *node)
{
WRITE_NODE_TYPE("SORT");

_outSortInfo(str, node);
}

static void
_outIncrementalSort(StringInfo str, const IncrementalSort *node)
{
WRITE_NODE_TYPE("INCREMENTALSORT");

_outSortInfo(str, (const Sort *) node);
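
/* number of leading sort-key columns the input is already sorted by */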
WRITE_INT_FIELD(nPresortedCols);
}

static void
_outUnique(StringInfo str, const Unique *node)
{
WRITE_NODE_TYPE("UNIQUE");

_outPlanInfo(str, (const Plan *) node);

WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(uniqColIdx, node->numCols);
WRITE_OID_ARRAY(uniqOperators, node->numCols);
WRITE_OID_ARRAY(uniqCollations, node->numCols);
}

static void
_outHash(StringInfo str, const Hash *node)
{
WRITE_NODE_TYPE("HASH");

_outPlanInfo(str, (const Plan *) node);
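
/*
 * hashkeys are the expressions hashed by this node; they are split off
 * from the HashJoin's clauses at plan time so that the Hash node
 * references its own subplan rather than the join's.
 */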
WRITE_NODE_FIELD(hashkeys);
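
/*
 * Skew-optimization fields: they identify the outer join key whose most
 * common values get preferentially-retained hash buckets.
 */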
WRITE_OID_FIELD(skewTable);
WRITE_INT_FIELD(skewColumn);
WRITE_BOOL_FIELD(skewInherit);
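
/* total-rows estimate, used only when the Hash is parallel-aware */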
WRITE_FLOAT_FIELD(rows_total, "%.0f");
}

static void
_outSetOp(StringInfo str, const SetOp *node)
{
WRITE_NODE_TYPE("SETOP");

_outPlanInfo(str, (const Plan *) node);

WRITE_ENUM_FIELD(cmd, SetOpCmd);
WRITE_ENUM_FIELD(strategy, SetOpStrategy);
WRITE_INT_FIELD(numCols);
WRITE_ATTRNUMBER_ARRAY(dupColIdx, node->numCols);
WRITE_OID_ARRAY(dupOperators, node->numCols);
WRITE_OID_ARRAY(dupCollations, node->numCols);
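
/*
 * flagColIdx and firstFlag locate and interpret the junk column that
 * tags each input row with the relation it came from.
 */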
WRITE_INT_FIELD(flagColIdx);
WRITE_INT_FIELD(firstFlag);
WRITE_LONG_FIELD(numGroups);
}

static void
_outLockRows(StringInfo str, const LockRows *node)
{
WRITE_NODE_TYPE("LOCKROWS");

_outPlanInfo(str, (const Plan *) node);

WRITE_NODE_FIELD(rowMarks);
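
/*
 * epqParam is the ID of the special runtime Param used to signal
 * EvalPlanQual rechecks to the plan nodes below.
 */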
WRITE_INT_FIELD(epqParam);
}

static void
_outLimit(StringInfo str, const Limit *node)
{
WRITE_NODE_TYPE("LIMIT");

_outPlanInfo(str, (const Plan *) node);

WRITE_NODE_FIELD(limitOffset);
WRITE_NODE_FIELD(limitCount);
WRITE_ENUM_FIELD(limitOption, LimitOption);
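
/*
 * The uniq* fields identify the sort columns compared to detect peer
 * rows for LIMIT ... WITH TIES.
 */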
WRITE_INT_FIELD(uniqNumCols);
WRITE_ATTRNUMBER_ARRAY(uniqColIdx, node->uniqNumCols);
WRITE_OID_ARRAY(uniqOperators, node->uniqNumCols);
WRITE_OID_ARRAY(uniqCollations, node->uniqNumCols);
}

static void
_outNestLoopParam(StringInfo str, const NestLoopParam *node)
{
WRITE_NODE_TYPE("NESTLOOPPARAM");
|
|
|
|
|
|
|
|
WRITE_INT_FIELD(paramno);
|
|
|
|
WRITE_NODE_FIELD(paramval);
|
|
|
|
}
|
|
|
|
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
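/*
 * PlanRowMark tells the executor how each non-target relation is to be
 * marked or locked; allMarkTypes is the union of the mark types used by
 * all children of an inheritance parent.
 */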
static void
_outPlanRowMark(StringInfo str, const PlanRowMark *node)
{
	WRITE_NODE_TYPE("PLANROWMARK");

	WRITE_UINT_FIELD(rti);
	WRITE_UINT_FIELD(prti);
	WRITE_UINT_FIELD(rowmarkId);
	WRITE_ENUM_FIELD(markType, RowMarkType);
	WRITE_INT_FIELD(allMarkTypes);
	WRITE_ENUM_FIELD(strength, LockClauseStrength);
	WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
	WRITE_BOOL_FIELD(isParent);
}

static void
_outPartitionPruneInfo(StringInfo str, const PartitionPruneInfo *node)
{
	WRITE_NODE_TYPE("PARTITIONPRUNEINFO");

	WRITE_NODE_FIELD(prune_infos);
	WRITE_BITMAPSET_FIELD(other_subplans);
}

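/*
 * The subplan_map, subpart_map, and relid_map arrays are all indexed by
 * partition number, so each is emitted with node->nparts elements.
 */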
static void
_outPartitionedRelPruneInfo(StringInfo str, const PartitionedRelPruneInfo *node)
{
	WRITE_NODE_TYPE("PARTITIONEDRELPRUNEINFO");

	WRITE_UINT_FIELD(rtindex);
	WRITE_BITMAPSET_FIELD(present_parts);
	WRITE_INT_FIELD(nparts);
	WRITE_INT_ARRAY(subplan_map, node->nparts);
	WRITE_INT_ARRAY(subpart_map, node->nparts);
	WRITE_OID_ARRAY(relid_map, node->nparts);
	WRITE_NODE_FIELD(initial_pruning_steps);
	WRITE_NODE_FIELD(exec_pruning_steps);
	WRITE_BITMAPSET_FIELD(execparamids);
}

static void
_outPartitionPruneStepOp(StringInfo str, const PartitionPruneStepOp *node)
{
	WRITE_NODE_TYPE("PARTITIONPRUNESTEPOP");

	WRITE_INT_FIELD(step.step_id);
	WRITE_INT_FIELD(opstrategy);
	WRITE_NODE_FIELD(exprs);
	WRITE_NODE_FIELD(cmpfns);
	WRITE_BITMAPSET_FIELD(nullkeys);
}

static void
_outPartitionPruneStepCombine(StringInfo str, const PartitionPruneStepCombine *node)
{
	WRITE_NODE_TYPE("PARTITIONPRUNESTEPCOMBINE");

	WRITE_INT_FIELD(step.step_id);
	WRITE_ENUM_FIELD(combineOp, PartitionPruneCombineOp);
	WRITE_NODE_FIELD(source_stepids);
}

static void
_outPlanInvalItem(StringInfo str, const PlanInvalItem *node)
{
	WRITE_NODE_TYPE("PLANINVALITEM");

	WRITE_INT_FIELD(cacheId);
	WRITE_UINT_FIELD(hashValue);
}

/*****************************************************************************
 *
 *	Stuff from primnodes.h.
 *
 *****************************************************************************/

static void
_outAlias(StringInfo str, const Alias *node)
{
	WRITE_NODE_TYPE("ALIAS");

	WRITE_STRING_FIELD(aliasname);
	WRITE_NODE_FIELD(colnames);
}

static void
_outRangeVar(StringInfo str, const RangeVar *node)
{
	WRITE_NODE_TYPE("RANGEVAR");

	/*
	 * we deliberately ignore catalogname here, since it is presently not
	 * semantically meaningful
	 */
	WRITE_STRING_FIELD(schemaname);
	WRITE_STRING_FIELD(relname);
	WRITE_BOOL_FIELD(inh);
	WRITE_CHAR_FIELD(relpersistence);
	WRITE_NODE_FIELD(alias);
	WRITE_LOCATION_FIELD(location);
}

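/*
 * TableFunc is shared by XMLTABLE and JSON_TABLE; functype distinguishes
 * the two, and the ns_uris/ns_names lists are used only for XMLTABLE
 * namespaces.
 */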
static void
_outTableFunc(StringInfo str, const TableFunc *node)
{
	WRITE_NODE_TYPE("TABLEFUNC");

	WRITE_ENUM_FIELD(functype, TableFuncType);
	WRITE_NODE_FIELD(ns_uris);
	WRITE_NODE_FIELD(ns_names);
	WRITE_NODE_FIELD(docexpr);
	WRITE_NODE_FIELD(rowexpr);
	WRITE_NODE_FIELD(colnames);
	WRITE_NODE_FIELD(coltypes);
	WRITE_NODE_FIELD(coltypmods);
	WRITE_NODE_FIELD(colcollations);
	WRITE_NODE_FIELD(colexprs);
	WRITE_NODE_FIELD(coldefexprs);
	WRITE_NODE_FIELD(colvalexprs);
	WRITE_BITMAPSET_FIELD(notnulls);
	WRITE_NODE_FIELD(plan);
	WRITE_INT_FIELD(ordinalitycol);
	WRITE_LOCATION_FIELD(location);
}

static void
_outIntoClause(StringInfo str, const IntoClause *node)
{
	WRITE_NODE_TYPE("INTOCLAUSE");

	WRITE_NODE_FIELD(rel);
	WRITE_NODE_FIELD(colNames);
	WRITE_STRING_FIELD(accessMethod);
	WRITE_NODE_FIELD(options);
	WRITE_ENUM_FIELD(onCommit, OnCommitAction);
	WRITE_STRING_FIELD(tableSpaceName);
	WRITE_NODE_FIELD(viewQuery);
	WRITE_BOOL_FIELD(skipData);
}

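/*
 * varnosyn/varattnosyn record the Var's syntactic identity (e.g. its
 * position in an aliased JOIN), which ruleutils.c needs when deparsing;
 * varno/varattno give the semantic identity used by the planner.
 */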
static void
_outVar(StringInfo str, const Var *node)
{
	WRITE_NODE_TYPE("VAR");

	WRITE_INT_FIELD(varno);
	WRITE_INT_FIELD(varattno);
	WRITE_OID_FIELD(vartype);
	WRITE_INT_FIELD(vartypmod);
	WRITE_OID_FIELD(varcollid);
	WRITE_UINT_FIELD(varlevelsup);
	WRITE_UINT_FIELD(varnosyn);
	WRITE_INT_FIELD(varattnosyn);
	WRITE_LOCATION_FIELD(location);
}

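/*
 * Const needs custom handling for constvalue, whose representation depends
 * on constlen/constbyval; a NULL is printed as "<>", anything else goes
 * through outDatum.  For a pass-by-value int4 constant the output looks
 * roughly like ":constvalue 4 [ 10 0 0 0 0 0 0 0 ]" (a sketch of the
 * byte-dump format, not verbatim output).
 */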
static void
_outConst(StringInfo str, const Const *node)
{
	WRITE_NODE_TYPE("CONST");

	WRITE_OID_FIELD(consttype);
	WRITE_INT_FIELD(consttypmod);
	WRITE_OID_FIELD(constcollid);
	WRITE_INT_FIELD(constlen);
	WRITE_BOOL_FIELD(constbyval);
	WRITE_BOOL_FIELD(constisnull);
	WRITE_LOCATION_FIELD(location);

	appendStringInfoString(str, " :constvalue ");
	if (node->constisnull)
		appendStringInfoString(str, "<>");
	else
		outDatum(str, node->constvalue, node->constlen, node->constbyval);
}

static void
_outParam(StringInfo str, const Param *node)
{
	WRITE_NODE_TYPE("PARAM");

	WRITE_ENUM_FIELD(paramkind, ParamKind);
	WRITE_INT_FIELD(paramid);
	WRITE_OID_FIELD(paramtype);
	WRITE_INT_FIELD(paramtypmod);
	WRITE_OID_FIELD(paramcollid);
	WRITE_LOCATION_FIELD(location);
}

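/*
 * Aggref keeps the aggregated arguments in args and, for ordered-set
 * aggregates, the direct arguments in aggdirectargs; aggargtypes records
 * the user-level argument types as an OID list, since args may have been
 * replaced during partial aggregation.
 */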
static void
_outAggref(StringInfo str, const Aggref *node)
{
	WRITE_NODE_TYPE("AGGREF");

	WRITE_OID_FIELD(aggfnoid);
	WRITE_OID_FIELD(aggtype);
	WRITE_OID_FIELD(aggcollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_OID_FIELD(aggtranstype);
	WRITE_NODE_FIELD(aggargtypes);
	WRITE_NODE_FIELD(aggdirectargs);
	WRITE_NODE_FIELD(args);
	WRITE_NODE_FIELD(aggorder);
	WRITE_NODE_FIELD(aggdistinct);
	WRITE_NODE_FIELD(aggfilter);
	WRITE_BOOL_FIELD(aggstar);
	WRITE_BOOL_FIELD(aggvariadic);
	WRITE_CHAR_FIELD(aggkind);
	WRITE_UINT_FIELD(agglevelsup);
	WRITE_ENUM_FIELD(aggsplit, AggSplit);
	WRITE_INT_FIELD(aggno);
	WRITE_INT_FIELD(aggtransno);
	WRITE_LOCATION_FIELD(location);
}

static void
_outGroupingFunc(StringInfo str, const GroupingFunc *node)
{
	WRITE_NODE_TYPE("GROUPINGFUNC");

	WRITE_NODE_FIELD(args);
	WRITE_NODE_FIELD(refs);
	WRITE_NODE_FIELD(cols);
	WRITE_UINT_FIELD(agglevelsup);
	WRITE_LOCATION_FIELD(location);
}

static void
_outWindowFunc(StringInfo str, const WindowFunc *node)
{
	WRITE_NODE_TYPE("WINDOWFUNC");

	WRITE_OID_FIELD(winfnoid);
	WRITE_OID_FIELD(wintype);
	WRITE_OID_FIELD(wincollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_NODE_FIELD(aggfilter);
	WRITE_UINT_FIELD(winref);
	WRITE_BOOL_FIELD(winstar);
	WRITE_BOOL_FIELD(winagg);
	WRITE_LOCATION_FIELD(location);
}

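/*
 * SubscriptingRef generalizes the old ArrayRef node to any subscriptable
 * type; refassgnexpr is non-NULL only when the node is an assignment
 * target rather than a fetch.
 */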
static void
_outSubscriptingRef(StringInfo str, const SubscriptingRef *node)
{
	WRITE_NODE_TYPE("SUBSCRIPTINGREF");

	WRITE_OID_FIELD(refcontainertype);
	WRITE_OID_FIELD(refelemtype);
	WRITE_OID_FIELD(refrestype);
	WRITE_INT_FIELD(reftypmod);
	WRITE_OID_FIELD(refcollid);
	WRITE_NODE_FIELD(refupperindexpr);
	WRITE_NODE_FIELD(reflowerindexpr);
	WRITE_NODE_FIELD(refexpr);
	WRITE_NODE_FIELD(refassgnexpr);
}

static void
_outFuncExpr(StringInfo str, const FuncExpr *node)
{
	WRITE_NODE_TYPE("FUNCEXPR");

	WRITE_OID_FIELD(funcid);
	WRITE_OID_FIELD(funcresulttype);
	WRITE_BOOL_FIELD(funcretset);
	WRITE_BOOL_FIELD(funcvariadic);
	WRITE_ENUM_FIELD(funcformat, CoercionForm);
	WRITE_OID_FIELD(funccollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outNamedArgExpr(StringInfo str, const NamedArgExpr *node)
{
	WRITE_NODE_TYPE("NAMEDARGEXPR");

	WRITE_NODE_FIELD(arg);
	WRITE_STRING_FIELD(name);
	WRITE_INT_FIELD(argnumber);
	WRITE_LOCATION_FIELD(location);
}

static void
_outOpExpr(StringInfo str, const OpExpr *node)
{
	WRITE_NODE_TYPE("OPEXPR");

	WRITE_OID_FIELD(opno);
	WRITE_OID_FIELD(opfuncid);
	WRITE_OID_FIELD(opresulttype);
	WRITE_BOOL_FIELD(opretset);
	WRITE_OID_FIELD(opcollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outDistinctExpr(StringInfo str, const DistinctExpr *node)
{
	WRITE_NODE_TYPE("DISTINCTEXPR");

	WRITE_OID_FIELD(opno);
	WRITE_OID_FIELD(opfuncid);
	WRITE_OID_FIELD(opresulttype);
	WRITE_BOOL_FIELD(opretset);
	WRITE_OID_FIELD(opcollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outNullIfExpr(StringInfo str, const NullIfExpr *node)
{
	WRITE_NODE_TYPE("NULLIFEXPR");

	WRITE_OID_FIELD(opno);
	WRITE_OID_FIELD(opfuncid);
	WRITE_OID_FIELD(opresulttype);
	WRITE_BOOL_FIELD(opretset);
	WRITE_OID_FIELD(opcollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outScalarArrayOpExpr(StringInfo str, const ScalarArrayOpExpr *node)
{
	WRITE_NODE_TYPE("SCALARARRAYOPEXPR");

	WRITE_OID_FIELD(opno);
	WRITE_OID_FIELD(opfuncid);
	WRITE_OID_FIELD(hashfuncid);
	WRITE_OID_FIELD(negfuncid);
	WRITE_BOOL_FIELD(useOr);
	WRITE_OID_FIELD(inputcollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

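/*
 * BoolExpr writes boolop by hand rather than with WRITE_ENUM_FIELD,
 * because the read side expects the lowercase tokens "and", "or", and
 * "not" instead of a numeric enum value.
 */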
static void
_outBoolExpr(StringInfo str, const BoolExpr *node)
{
	char	   *opstr = NULL;

	WRITE_NODE_TYPE("BOOLEXPR");

	/* do-it-yourself enum representation */
	switch (node->boolop)
	{
		case AND_EXPR:
			opstr = "and";
			break;
		case OR_EXPR:
			opstr = "or";
			break;
		case NOT_EXPR:
			opstr = "not";
			break;
	}
	appendStringInfoString(str, " :boolop ");
	outToken(str, opstr);

	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSubLink(StringInfo str, const SubLink *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SUBLINK");
|
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(subLinkType, SubLinkType);
|
Implement UPDATE tab SET (col1,col2,...) = (SELECT ...), ...
This SQL-standard feature allows a sub-SELECT yielding multiple columns
(but only one row) to be used to compute the new values of several columns
to be updated. While the same results can be had with an independent
sub-SELECT per column, such a workaround can require a great deal of
duplicated computation.
The standard actually says that the source for a multi-column assignment
could be any row-valued expression. The implementation used here is
tightly tied to our existing sub-SELECT support and can't handle other
cases; the Bison grammar would have some issues with them too. However,
I don't feel too bad about this since other cases can be converted into
sub-SELECTs. For instance, "SET (a,b,c) = row_valued_function(x)" could
be written "SET (a,b,c) = (SELECT * FROM row_valued_function(x))".
2014-06-18 19:22:25 +02:00
|
|
|
WRITE_INT_FIELD(subLinkId);
|
2005-12-28 02:30:02 +01:00
|
|
|
WRITE_NODE_FIELD(testexpr);
|
2003-01-10 22:08:15 +01:00
|
|
|
WRITE_NODE_FIELD(operName);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(subselect);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2002-12-12 16:49:42 +01:00
|
|
|
}
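
/*
 * Note that plan_id merely identifies the referenced subplan within the
 * planner's global list of subplans; the plan tree itself is not written
 * out here.
 */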

static void
_outSubPlan(StringInfo str, const SubPlan *node)
{
	WRITE_NODE_TYPE("SUBPLAN");

	WRITE_ENUM_FIELD(subLinkType, SubLinkType);
	WRITE_NODE_FIELD(testexpr);
	WRITE_NODE_FIELD(paramIds);
	WRITE_INT_FIELD(plan_id);
	WRITE_STRING_FIELD(plan_name);
	WRITE_OID_FIELD(firstColType);
	WRITE_INT_FIELD(firstColTypmod);
	WRITE_OID_FIELD(firstColCollation);
	WRITE_BOOL_FIELD(useHashTable);
	WRITE_BOOL_FIELD(unknownEqFalse);
	WRITE_BOOL_FIELD(parallel_safe);
	WRITE_NODE_FIELD(setParam);
	WRITE_NODE_FIELD(parParam);
	WRITE_NODE_FIELD(args);
	WRITE_FLOAT_FIELD(startup_cost, "%.2f");
	WRITE_FLOAT_FIELD(per_call_cost, "%.2f");
}

static void
_outAlternativeSubPlan(StringInfo str, const AlternativeSubPlan *node)
{
	WRITE_NODE_TYPE("ALTERNATIVESUBPLAN");

	WRITE_NODE_FIELD(subplans);
}

static void
_outFieldSelect(StringInfo str, const FieldSelect *node)
{
	WRITE_NODE_TYPE("FIELDSELECT");

	WRITE_NODE_FIELD(arg);
	WRITE_INT_FIELD(fieldnum);
	WRITE_OID_FIELD(resulttype);
	WRITE_INT_FIELD(resulttypmod);
	WRITE_OID_FIELD(resultcollid);
}

static void
_outFieldStore(StringInfo str, const FieldStore *node)
{
	WRITE_NODE_TYPE("FIELDSTORE");

	WRITE_NODE_FIELD(arg);
	WRITE_NODE_FIELD(newvals);
	WRITE_NODE_FIELD(fieldnums);
	WRITE_OID_FIELD(resulttype);
}

static void
_outRelabelType(StringInfo str, const RelabelType *node)
{
	WRITE_NODE_TYPE("RELABELTYPE");

	WRITE_NODE_FIELD(arg);
	WRITE_OID_FIELD(resulttype);
	WRITE_INT_FIELD(resulttypmod);
	WRITE_OID_FIELD(resultcollid);
	WRITE_ENUM_FIELD(relabelformat, CoercionForm);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCoerceViaIO(StringInfo str, const CoerceViaIO *node)
{
	WRITE_NODE_TYPE("COERCEVIAIO");

	WRITE_NODE_FIELD(arg);
	WRITE_OID_FIELD(resulttype);
	WRITE_OID_FIELD(resultcollid);
	WRITE_ENUM_FIELD(coerceformat, CoercionForm);
	WRITE_LOCATION_FIELD(location);
}

static void
_outArrayCoerceExpr(StringInfo str, const ArrayCoerceExpr *node)
{
	WRITE_NODE_TYPE("ARRAYCOERCEEXPR");

	WRITE_NODE_FIELD(arg);
	WRITE_NODE_FIELD(elemexpr);
	WRITE_OID_FIELD(resulttype);
	WRITE_INT_FIELD(resulttypmod);
	WRITE_OID_FIELD(resultcollid);
	WRITE_ENUM_FIELD(coerceformat, CoercionForm);
	WRITE_LOCATION_FIELD(location);
}

static void
_outConvertRowtypeExpr(StringInfo str, const ConvertRowtypeExpr *node)
{
	WRITE_NODE_TYPE("CONVERTROWTYPEEXPR");

	WRITE_NODE_FIELD(arg);
	WRITE_OID_FIELD(resulttype);
	WRITE_ENUM_FIELD(convertformat, CoercionForm);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCollateExpr(StringInfo str, const CollateExpr *node)
{
	WRITE_NODE_TYPE("COLLATEEXPR");

	WRITE_NODE_FIELD(arg);
	WRITE_OID_FIELD(collOid);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCaseExpr(StringInfo str, const CaseExpr *node)
{
	WRITE_NODE_TYPE("CASEEXPR");

	WRITE_OID_FIELD(casetype);
	WRITE_OID_FIELD(casecollid);
	WRITE_NODE_FIELD(arg);
	WRITE_NODE_FIELD(args);
	WRITE_NODE_FIELD(defresult);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCaseWhen(StringInfo str, const CaseWhen *node)
{
	WRITE_NODE_TYPE("CASEWHEN");

	WRITE_NODE_FIELD(expr);
	WRITE_NODE_FIELD(result);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCaseTestExpr(StringInfo str, const CaseTestExpr *node)
{
	WRITE_NODE_TYPE("CASETESTEXPR");

	WRITE_OID_FIELD(typeId);
	WRITE_INT_FIELD(typeMod);
	WRITE_OID_FIELD(collation);
}

static void
_outArrayExpr(StringInfo str, const ArrayExpr *node)
{
	WRITE_NODE_TYPE("ARRAYEXPR");

	WRITE_OID_FIELD(array_typeid);
	WRITE_OID_FIELD(array_collid);
	WRITE_OID_FIELD(element_typeid);
	WRITE_NODE_FIELD(elements);
	WRITE_BOOL_FIELD(multidims);
	WRITE_LOCATION_FIELD(location);
}

static void
_outRowExpr(StringInfo str, const RowExpr *node)
{
	WRITE_NODE_TYPE("ROWEXPR");

	WRITE_NODE_FIELD(args);
	WRITE_OID_FIELD(row_typeid);
	WRITE_ENUM_FIELD(row_format, CoercionForm);
	WRITE_NODE_FIELD(colnames);
	WRITE_LOCATION_FIELD(location);
}
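
/*
 * In RowCompareExpr, opnos, opfamilies and inputcollids are lists that run
 * in parallel with the column lists largs and rargs, one entry per compared
 * column.
 */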

static void
_outRowCompareExpr(StringInfo str, const RowCompareExpr *node)
{
	WRITE_NODE_TYPE("ROWCOMPAREEXPR");

	WRITE_ENUM_FIELD(rctype, RowCompareType);
	WRITE_NODE_FIELD(opnos);
	WRITE_NODE_FIELD(opfamilies);
	WRITE_NODE_FIELD(inputcollids);
	WRITE_NODE_FIELD(largs);
	WRITE_NODE_FIELD(rargs);
}

static void
_outCoalesceExpr(StringInfo str, const CoalesceExpr *node)
{
	WRITE_NODE_TYPE("COALESCEEXPR");

	WRITE_OID_FIELD(coalescetype);
	WRITE_OID_FIELD(coalescecollid);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outMinMaxExpr(StringInfo str, const MinMaxExpr *node)
{
	WRITE_NODE_TYPE("MINMAXEXPR");

	WRITE_OID_FIELD(minmaxtype);
	WRITE_OID_FIELD(minmaxcollid);
	WRITE_OID_FIELD(inputcollid);
	WRITE_ENUM_FIELD(op, MinMaxOp);
	WRITE_NODE_FIELD(args);
	WRITE_LOCATION_FIELD(location);
}

static void
_outSQLValueFunction(StringInfo str, const SQLValueFunction *node)
{
	WRITE_NODE_TYPE("SQLVALUEFUNCTION");

	WRITE_ENUM_FIELD(op, SQLValueFunctionOp);
	WRITE_OID_FIELD(type);
	WRITE_INT_FIELD(typmod);
	WRITE_LOCATION_FIELD(location);
}

static void
_outXmlExpr(StringInfo str, const XmlExpr *node)
{
	WRITE_NODE_TYPE("XMLEXPR");

	WRITE_ENUM_FIELD(op, XmlExprOp);
	WRITE_STRING_FIELD(name);
	WRITE_NODE_FIELD(named_args);
	WRITE_NODE_FIELD(arg_names);
	WRITE_NODE_FIELD(args);
	WRITE_ENUM_FIELD(xmloption, XmlOptionType);
	WRITE_OID_FIELD(type);
	WRITE_INT_FIELD(typmod);
	WRITE_LOCATION_FIELD(location);
}

static void
_outNullTest(StringInfo str, const NullTest *node)
{
	WRITE_NODE_TYPE("NULLTEST");

	WRITE_NODE_FIELD(arg);
	WRITE_ENUM_FIELD(nulltesttype, NullTestType);
	WRITE_BOOL_FIELD(argisrow);
	WRITE_LOCATION_FIELD(location);
}

static void
_outBooleanTest(StringInfo str, const BooleanTest *node)
{
	WRITE_NODE_TYPE("BOOLEANTEST");

	WRITE_NODE_FIELD(arg);
	WRITE_ENUM_FIELD(booltesttype, BoolTestType);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCoerceToDomain(StringInfo str, const CoerceToDomain *node)
{
	WRITE_NODE_TYPE("COERCETODOMAIN");

	WRITE_NODE_FIELD(arg);
	WRITE_OID_FIELD(resulttype);
	WRITE_INT_FIELD(resulttypmod);
	WRITE_OID_FIELD(resultcollid);
	WRITE_ENUM_FIELD(coercionformat, CoercionForm);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCoerceToDomainValue(StringInfo str, const CoerceToDomainValue *node)
{
	WRITE_NODE_TYPE("COERCETODOMAINVALUE");

	WRITE_OID_FIELD(typeId);
	WRITE_INT_FIELD(typeMod);
	WRITE_OID_FIELD(collation);
	WRITE_LOCATION_FIELD(location);
}

static void
_outSetToDefault(StringInfo str, const SetToDefault *node)
{
	WRITE_NODE_TYPE("SETTODEFAULT");

	WRITE_OID_FIELD(typeId);
	WRITE_INT_FIELD(typeMod);
	WRITE_OID_FIELD(collation);
	WRITE_LOCATION_FIELD(location);
}

static void
_outCurrentOfExpr(StringInfo str, const CurrentOfExpr *node)
{
	WRITE_NODE_TYPE("CURRENTOFEXPR");

	WRITE_UINT_FIELD(cvarno);
	WRITE_STRING_FIELD(cursor_name);
	WRITE_INT_FIELD(cursor_param);
}
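
/*
 * NextValueExpr should never appear in a stored rule or a parallelizable
 * query, so output support for it mainly serves debugging.
 */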

static void
_outNextValueExpr(StringInfo str, const NextValueExpr *node)
{
	WRITE_NODE_TYPE("NEXTVALUEEXPR");

	WRITE_OID_FIELD(seqid);
	WRITE_OID_FIELD(typeId);
}

static void
_outInferenceElem(StringInfo str, const InferenceElem *node)
{
	WRITE_NODE_TYPE("INFERENCEELEM");

	WRITE_NODE_FIELD(expr);
	WRITE_OID_FIELD(infercollid);
	WRITE_OID_FIELD(inferopclass);
}

static void
_outTargetEntry(StringInfo str, const TargetEntry *node)
{
	WRITE_NODE_TYPE("TARGETENTRY");

	WRITE_NODE_FIELD(expr);
	WRITE_INT_FIELD(resno);
	WRITE_STRING_FIELD(resname);
	WRITE_UINT_FIELD(ressortgroupref);
	WRITE_OID_FIELD(resorigtbl);
	WRITE_INT_FIELD(resorigcol);
	WRITE_BOOL_FIELD(resjunk);
}

static void
_outRangeTblRef(StringInfo str, const RangeTblRef *node)
{
	WRITE_NODE_TYPE("RANGETBLREF");

	WRITE_INT_FIELD(rtindex);
}

static void
_outJoinExpr(StringInfo str, const JoinExpr *node)
{
	WRITE_NODE_TYPE("JOINEXPR");

	WRITE_ENUM_FIELD(jointype, JoinType);
	WRITE_BOOL_FIELD(isNatural);
	WRITE_NODE_FIELD(larg);
	WRITE_NODE_FIELD(rarg);
	WRITE_NODE_FIELD(usingClause);
	WRITE_NODE_FIELD(join_using_alias);
	WRITE_NODE_FIELD(quals);
	WRITE_NODE_FIELD(alias);
	WRITE_INT_FIELD(rtindex);
}

static void
_outFromExpr(StringInfo str, const FromExpr *node)
{
	WRITE_NODE_TYPE("FROMEXPR");

	WRITE_NODE_FIELD(fromlist);
	WRITE_NODE_FIELD(quals);
}

static void
_outOnConflictExpr(StringInfo str, const OnConflictExpr *node)
{
	WRITE_NODE_TYPE("ONCONFLICTEXPR");

	WRITE_ENUM_FIELD(action, OnConflictAction);
	WRITE_NODE_FIELD(arbiterElems);
	WRITE_NODE_FIELD(arbiterWhere);
	WRITE_OID_FIELD(constraint);
	WRITE_NODE_FIELD(onConflictSet);
	WRITE_NODE_FIELD(onConflictWhere);
	WRITE_INT_FIELD(exclRelIndex);
	WRITE_NODE_FIELD(exclRelTlist);
}

static void
_outJsonFormat(StringInfo str, const JsonFormat *node)
{
	WRITE_NODE_TYPE("JSONFORMAT");

	WRITE_ENUM_FIELD(format_type, JsonFormatType);
	WRITE_ENUM_FIELD(encoding, JsonEncoding);
	WRITE_LOCATION_FIELD(location);
}

static void
_outJsonReturning(StringInfo str, const JsonReturning *node)
{
	WRITE_NODE_TYPE("JSONRETURNING");

	WRITE_NODE_FIELD(format);
	WRITE_OID_FIELD(typid);
	WRITE_INT_FIELD(typmod);
}

static void
_outJsonValueExpr(StringInfo str, const JsonValueExpr *node)
{
	WRITE_NODE_TYPE("JSONVALUEEXPR");

	WRITE_NODE_FIELD(raw_expr);
	WRITE_NODE_FIELD(formatted_expr);
	WRITE_NODE_FIELD(format);
}

static void
_outJsonConstructorExpr(StringInfo str, const JsonConstructorExpr *node)
{
	WRITE_NODE_TYPE("JSONCONSTRUCTOREXPR");

	WRITE_ENUM_FIELD(type, JsonConstructorType);
	WRITE_NODE_FIELD(args);
	WRITE_NODE_FIELD(func);
	WRITE_NODE_FIELD(coercion);
	WRITE_NODE_FIELD(returning);
	WRITE_BOOL_FIELD(absent_on_null);
	WRITE_BOOL_FIELD(unique);
	WRITE_LOCATION_FIELD(location);
}

static void
_outJsonIsPredicate(StringInfo str, const JsonIsPredicate *node)
{
	WRITE_NODE_TYPE("JSONISPREDICATE");

	WRITE_NODE_FIELD(expr);
	WRITE_NODE_FIELD(format);
	WRITE_ENUM_FIELD(item_type, JsonValueType);
	WRITE_BOOL_FIELD(unique_keys);
	WRITE_LOCATION_FIELD(location);
}

static void
_outJsonBehavior(StringInfo str, const JsonBehavior *node)
{
	WRITE_NODE_TYPE("JSONBEHAVIOR");

	WRITE_ENUM_FIELD(btype, JsonBehaviorType);
	WRITE_NODE_FIELD(default_expr);
}

static void
_outJsonExpr(StringInfo str, const JsonExpr *node)
{
	WRITE_NODE_TYPE("JSONEXPR");

	WRITE_ENUM_FIELD(op, JsonExprOp);
	WRITE_NODE_FIELD(formatted_expr);
	WRITE_NODE_FIELD(result_coercion);
	WRITE_NODE_FIELD(format);
	WRITE_NODE_FIELD(path_spec);
	WRITE_NODE_FIELD(passing_names);
	WRITE_NODE_FIELD(passing_values);
	WRITE_NODE_FIELD(returning);
	WRITE_NODE_FIELD(on_empty);
	WRITE_NODE_FIELD(on_error);
	WRITE_NODE_FIELD(coercions);
	WRITE_ENUM_FIELD(wrapper, JsonWrapper);
	WRITE_BOOL_FIELD(omit_quotes);
	WRITE_LOCATION_FIELD(location);
}

static void
_outJsonCoercion(StringInfo str, const JsonCoercion *node)
{
	WRITE_NODE_TYPE("JSONCOERCION");

	WRITE_NODE_FIELD(expr);
	WRITE_BOOL_FIELD(via_populate);
	WRITE_BOOL_FIELD(via_io);
	WRITE_OID_FIELD(collation);
}
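
/*
 * JsonItemCoercions carries one coercion expression per possible SQL/JSON
 * item type; fields left unset are NULL and print as "<>".
 */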

static void
_outJsonItemCoercions(StringInfo str, const JsonItemCoercions *node)
{
	WRITE_NODE_TYPE("JSONITEMCOERCIONS");

	WRITE_NODE_FIELD(null);
	WRITE_NODE_FIELD(string);
	WRITE_NODE_FIELD(numeric);
	WRITE_NODE_FIELD(boolean);
	WRITE_NODE_FIELD(date);
	WRITE_NODE_FIELD(time);
	WRITE_NODE_FIELD(timetz);
	WRITE_NODE_FIELD(timestamp);
	WRITE_NODE_FIELD(timestamptz);
	WRITE_NODE_FIELD(composite);
}

static void
_outJsonTableParent(StringInfo str, const JsonTableParent *node)
{
	WRITE_NODE_TYPE("JSONTABLEPARENT");

	WRITE_NODE_FIELD(path);
	WRITE_STRING_FIELD(name);
	WRITE_NODE_FIELD(child);
	WRITE_BOOL_FIELD(outerJoin);
	WRITE_INT_FIELD(colMin);
	WRITE_INT_FIELD(colMax);
	WRITE_BOOL_FIELD(errorOnError);
}

static void
_outJsonTableSibling(StringInfo str, const JsonTableSibling *node)
{
	WRITE_NODE_TYPE("JSONTABLESIBLING");

	WRITE_NODE_FIELD(larg);
	WRITE_NODE_FIELD(rarg);
	WRITE_BOOL_FIELD(cross);
}

/*****************************************************************************
 *
 *	Stuff from pathnodes.h.
 *
 *****************************************************************************/

/*
 * print the basic stuff of all nodes that inherit from Path
 *
 * Note we do NOT print the parent, else we'd be in infinite recursion.
 * We can print the parent's relids for identification purposes, though.
 * We print the pathtarget only if it's not the default one for the rel.
 * We also do not print the whole of param_info, since it's printed by
 * _outRelOptInfo; it's sufficient and less cluttering to print just the
 * required outer relids.
 */
static void
_outPathInfo(StringInfo str, const Path *node)
{
	WRITE_ENUM_FIELD(pathtype, NodeTag);
	appendStringInfoString(str, " :parent_relids ");
	outBitmapset(str, node->parent->relids);
	if (node->pathtarget != node->parent->reltarget)
		WRITE_NODE_FIELD(pathtarget);
	appendStringInfoString(str, " :required_outer ");
	if (node->param_info)
		outBitmapset(str, node->param_info->ppi_req_outer);
	else
		outBitmapset(str, NULL);
	WRITE_BOOL_FIELD(parallel_aware);
	WRITE_BOOL_FIELD(parallel_safe);
	WRITE_INT_FIELD(parallel_workers);
	WRITE_FLOAT_FIELD(rows, "%.0f");
	WRITE_FLOAT_FIELD(startup_cost, "%.2f");
	WRITE_FLOAT_FIELD(total_cost, "%.2f");
|
|
|
|
WRITE_NODE_FIELD(pathkeys);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
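
/*
 * Added commentary (not part of the original file): an illustrative sketch
 * of what the common fields emitted by _outPathInfo() look like in the
 * output token stream.  All values here are made up.  Note that
 * :pathtarget appears only when it differs from the parent rel's default
 * reltarget, and that :required_outer carries just the param_info relids:
 *
 *	 :pathtype 335 :parent_relids (b 1) :required_outer (b)
 *	 :parallel_aware false :parallel_safe true :parallel_workers 0
 *	 :rows 100 :startup_cost 0.00 :total_cost 445.00 :pathkeys <>
 */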

/*
 * print the basic stuff of all nodes that inherit from JoinPath
 */
static void
_outJoinPathInfo(StringInfo str, const JoinPath *node)
{
	_outPathInfo(str, (const Path *) node);

	WRITE_ENUM_FIELD(jointype, JoinType);
	WRITE_BOOL_FIELD(inner_unique);
	WRITE_NODE_FIELD(outerjoinpath);
	WRITE_NODE_FIELD(innerjoinpath);
	WRITE_NODE_FIELD(joinrestrictinfo);
}
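
/*
 * Added commentary (not part of the original file): concrete join-path
 * output functions are expected to write their node label first and then
 * delegate to _outJoinPathInfo(), mirroring how _outPath() below wraps
 * _outPathInfo().  A nested-loop variant would look roughly like:
 *
 *	 WRITE_NODE_TYPE("NESTPATH");
 *	 _outJoinPathInfo(str, (const JoinPath *) node);
 */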

static void
_outPath(StringInfo str, const Path *node)
{
	WRITE_NODE_TYPE("PATH");

	_outPathInfo(str, (const Path *) node);
}
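
/*
 * Added debugging sketch (not part of the original file): these _out
 * functions are reached through outNode(), most conveniently via
 * nodeToString().  A throwaway helper like the one below (the name
 * debug_print_path is hypothetical) dumps any Path as the token stream
 * built above:
 */
#ifdef NOT_USED
static void
debug_print_path(const Path *path)
{
	char	   *s = nodeToString(path);	/* dispatches to _outPath() etc. */

	elog(DEBUG1, "path: %s", s);
	pfree(s);
}
#endif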

static void
_outIndexPath(StringInfo str, const IndexPath *node)
{
	WRITE_NODE_TYPE("INDEXPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(indexinfo);
	WRITE_NODE_FIELD(indexclauses);
	WRITE_NODE_FIELD(indexorderbys);
	WRITE_NODE_FIELD(indexorderbycols);
	WRITE_ENUM_FIELD(indexscandir, ScanDirection);
	WRITE_FLOAT_FIELD(indextotalcost, "%.2f");
	WRITE_FLOAT_FIELD(indexselectivity, "%.4f");
}

static void
_outBitmapHeapPath(StringInfo str, const BitmapHeapPath *node)
{
	WRITE_NODE_TYPE("BITMAPHEAPPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(bitmapqual);
}

static void
_outBitmapAndPath(StringInfo str, const BitmapAndPath *node)
{
	WRITE_NODE_TYPE("BITMAPANDPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(bitmapquals);
	WRITE_FLOAT_FIELD(bitmapselectivity, "%.4f");
}

static void
_outBitmapOrPath(StringInfo str, const BitmapOrPath *node)
{
	WRITE_NODE_TYPE("BITMAPORPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(bitmapquals);
	WRITE_FLOAT_FIELD(bitmapselectivity, "%.4f");
}

static void
_outTidPath(StringInfo str, const TidPath *node)
{
	WRITE_NODE_TYPE("TIDPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(tidquals);
}

static void
_outTidRangePath(StringInfo str, const TidRangePath *node)
{
	WRITE_NODE_TYPE("TIDRANGEPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(tidrangequals);
}

static void
_outSubqueryScanPath(StringInfo str, const SubqueryScanPath *node)
{
	WRITE_NODE_TYPE("SUBQUERYSCANPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
}

static void
_outForeignPath(StringInfo str, const ForeignPath *node)
{
	WRITE_NODE_TYPE("FOREIGNPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(fdw_outerpath);
	WRITE_NODE_FIELD(fdw_private);
}
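
/*
 * Added commentary (not part of the original file): WRITE_NODE_FIELD can
 * handle fdw_private only because FDWs are expected to store node-tree
 * data (typically Lists) there; raw pointers to private structs would not
 * survive this kind of serialization.
 */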

static void
_outCustomPath(StringInfo str, const CustomPath *node)
{
	WRITE_NODE_TYPE("CUSTOMPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_UINT_FIELD(flags);
	WRITE_NODE_FIELD(custom_paths);
	WRITE_NODE_FIELD(custom_private);
	appendStringInfoString(str, " :methods ");
	outToken(str, node->methods->CustomName);
}
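
/*
 * Added commentary (not part of the original file): CustomPathMethods is
 * mostly function pointers, which cannot be serialized, so only the
 * provider's CustomName string is written as an identifying token.
 */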

static void
_outAppendPath(StringInfo str, const AppendPath *node)
{
	WRITE_NODE_TYPE("APPENDPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpaths);
	WRITE_INT_FIELD(first_partial_path);
	WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
}

static void
_outMergeAppendPath(StringInfo str, const MergeAppendPath *node)
{
	WRITE_NODE_TYPE("MERGEAPPENDPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpaths);
	WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
}

static void
_outGroupResultPath(StringInfo str, const GroupResultPath *node)
{
	WRITE_NODE_TYPE("GROUPRESULTPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(quals);
}
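
/*
 * Added commentary (not part of the original file): GroupResultPath is
 * used only in degenerate grouping cases where the query is known to
 * produce just one grouped row; the quals written above are one-time
 * quals suitable for a gating Result node.
 */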

static void
_outMaterialPath(StringInfo str, const MaterialPath *node)
{
	WRITE_NODE_TYPE("MATERIALPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
}

static void
_outMemoizePath(StringInfo str, const MemoizePath *node)
{
	WRITE_NODE_TYPE("MEMOIZEPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
	WRITE_NODE_FIELD(hash_operators);
	WRITE_NODE_FIELD(param_exprs);
	WRITE_BOOL_FIELD(singlerow);
	WRITE_BOOL_FIELD(binary_mode);
	WRITE_FLOAT_FIELD(calls, "%.0f");
	WRITE_UINT_FIELD(est_entries);
}

static void
_outUniquePath(StringInfo str, const UniquePath *node)
{
	WRITE_NODE_TYPE("UNIQUEPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
	WRITE_ENUM_FIELD(umethod, UniquePathMethod);
	WRITE_NODE_FIELD(in_operators);
	WRITE_NODE_FIELD(uniq_exprs);
}

static void
_outGatherPath(StringInfo str, const GatherPath *node)
{
	WRITE_NODE_TYPE("GATHERPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
	WRITE_BOOL_FIELD(single_copy);
	WRITE_INT_FIELD(num_workers);
}

static void
_outProjectionPath(StringInfo str, const ProjectionPath *node)
{
	WRITE_NODE_TYPE("PROJECTIONPATH");

	_outPathInfo(str, (const Path *) node);

	WRITE_NODE_FIELD(subpath);
	WRITE_BOOL_FIELD(dummypp);
}
|
|
|
|
|
Move targetlist SRF handling from expression evaluation to new executor node.
Evaluation of set-returning functions (SRFs) in the targetlist (like SELECT
generate_series(1,5)) was so far done in the expression evaluation (i.e.
ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code.
This meant that most executor nodes performing projection, and most
expression evaluation functions, had to deal with the possibility that an
evaluated expression could return a set of return values.
That's bad because it leads to repeated code in a lot of places. It also,
and that's my (Andres's) motivation, made it a lot harder to implement a
more efficient way of doing expression evaluation.
To fix this, introduce a new executor node (ProjectSet) that can evaluate
targetlists containing one or more SRFs. To avoid the complexity of the old
way of handling nested expressions returning sets (e.g. having to pass up
ExprDoneCond, and dealing with arguments to functions returning sets etc.),
those SRFs can only be at the top level of the node's targetlist. The
planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is
only necessary in ProjectSet nodes and that SRFs are only present at the
top level of the node's targetlist. If there are nested SRFs the planner
creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get
input from an underlying node.
We also discussed and prototyped evaluating targetlist SRFs using ROWS
FROM(), but that turned out to be more complicated than we'd hoped.
While moving SRF evaluation to ProjectSet would have allowed retaining the
old "least common multiple" behavior when multiple SRFs are present in one
targetlist (i.e. continue returning rows until all SRFs reach the end of
their input at the same time), we decided to instead only return rows until
all SRFs are exhausted, returning NULL for already-exhausted ones. We
deemed the previous behavior too confusing, unexpected and actually
not particularly useful.
As a side effect, the previously prohibited case of multiple set-returning
arguments to a function is now allowed. Not because it's particularly
desirable, but because it ends up working and there seems to be no argument
for adding code to prohibit it.
Currently the behavior for COALESCE and CASE containing SRFs has changed,
returning multiple rows from the expression even when the SRF-containing
"arm" of the expression is not evaluated. That's because the SRFs are
evaluated in a separate ProjectSet node. As that's quite confusing, we're
likely to instead prohibit SRFs in those places. But that's still being
discussed, and the code would reside in places not touched here, so that's
a task for later.
There's a lot of now-superfluous code dealing with set-returning expressions
around. But as the changes to get rid of it are verbose and largely boring,
it seems better for readability to keep the cleanup as a separate commit.
Author: Tom Lane and Andres Freund
Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00
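The behavior change described above is easy to see with a small example;
the output below is reconstructed from the rule stated in the message
rather than copied from the regression tests:

	/*
	 * SELECT generate_series(1,3), generate_series(1,2);
	 *
	 *  generate_series | generate_series
	 * -----------------+-----------------
	 *                1 |               1
	 *                2 |               2
	 *                3 |
	 * (3 rows)
	 *
	 * The old "least common multiple" rule would have produced six rows;
	 * now the query stops once every SRF is exhausted, returning NULL for
	 * SRFs that ran out earlier.
	 */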
|
|
|
static void
|
|
|
|
_outProjectSetPath(StringInfo str, const ProjectSetPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PROJECTSETPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
}
|
|
|
|
|
2020-11-30 22:32:56 +01:00
|
|
|
static void
|
|
|
|
_outSortPathInfo(StringInfo str, const SortPath *node)
|
|
|
|
{
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
}
|
|
|
|
|
2016-03-07 21:58:22 +01:00
|
|
|
static void
|
|
|
|
_outSortPath(StringInfo str, const SortPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SORTPATH");
|
|
|
|
|
2020-11-30 22:32:56 +01:00
|
|
|
_outSortPathInfo(str, node);
|
|
|
|
}
|
2016-03-07 21:58:22 +01:00
|
|
|
|
2020-11-30 22:32:56 +01:00
|
|
|
static void
|
|
|
|
_outIncrementalSortPath(StringInfo str, const IncrementalSortPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("INCREMENTALSORTPATH");
|
|
|
|
|
|
|
|
_outSortPathInfo(str, (const SortPath *) node);
|
|
|
|
|
|
|
|
WRITE_INT_FIELD(nPresortedCols);
|
2016-03-07 21:58:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outGroupPath(StringInfo str, const GroupPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GROUPPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_NODE_FIELD(groupClause);
|
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outUpperUniquePath(StringInfo str, const UpperUniquePath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("UPPERUNIQUEPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_INT_FIELD(numkeys);
|
|
|
|
}
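For readers skimming these functions: each one emits the node's type tag
followed by ":fieldname value" pairs. A self-contained toy version of the
pattern (plain stdio standing in for StringInfo; all names here are
illustrative, not PostgreSQL's):

	#include <stdio.h>

	/* Toy node and macros mimicking the WRITE_*_FIELD conventions. */
	typedef struct DemoPath
	{
		int			numkeys;
		double		numGroups;
	} DemoPath;

	#define WRITE_INT_FIELD_DEMO(fldname) \
		printf(" :" #fldname " %d", node->fldname)
	#define WRITE_FLOAT_FIELD_DEMO(fldname, fmt) \
		printf(" :" #fldname " " fmt, node->fldname)

	static void
	_outDemoPath(const DemoPath *node)
	{
		printf("{DEMOPATH");
		WRITE_INT_FIELD_DEMO(numkeys);
		WRITE_FLOAT_FIELD_DEMO(numGroups, "%.0f");
		printf("}");
	}

	int
	main(void)
	{
		DemoPath	p = {2, 100.0};

		_outDemoPath(&p);	/* prints {DEMOPATH :numkeys 2 :numGroups 100} */
		putchar('\n');
		return 0;
	}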
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outAggPath(StringInfo str, const AggPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("AGGPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
|
2016-06-26 20:33:38 +02:00
|
|
|
WRITE_ENUM_FIELD(aggsplit, AggSplit);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_FLOAT_FIELD(numGroups, "%.0f");
|
2020-02-28 18:32:35 +01:00
|
|
|
WRITE_UINT64_FIELD(transitionSpace);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_NODE_FIELD(groupClause);
|
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
}
|
|
|
|
|
2017-03-27 05:20:54 +02:00
|
|
|
static void
|
|
|
|
_outRollupData(StringInfo str, const RollupData *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("ROLLUP");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(groupClause);
|
|
|
|
WRITE_NODE_FIELD(gsets);
|
|
|
|
WRITE_NODE_FIELD(gsets_data);
|
|
|
|
WRITE_FLOAT_FIELD(numGroups, "%.0f");
|
|
|
|
WRITE_BOOL_FIELD(hashable);
|
|
|
|
WRITE_BOOL_FIELD(is_hashed);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outGroupingSetData(StringInfo str, const GroupingSetData *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GSDATA");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(set);
|
|
|
|
WRITE_FLOAT_FIELD(numGroups, "%.0f");
|
|
|
|
}
|
|
|
|
|
2016-03-07 21:58:22 +01:00
|
|
|
static void
|
|
|
|
_outGroupingSetsPath(StringInfo str, const GroupingSetsPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GROUPINGSETSPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
2017-03-27 05:20:54 +02:00
|
|
|
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
|
|
|
|
WRITE_NODE_FIELD(rollups);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_NODE_FIELD(qual);
|
2020-02-28 18:32:35 +01:00
|
|
|
WRITE_UINT64_FIELD(transitionSpace);
|
2016-03-07 21:58:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outMinMaxAggPath(StringInfo str, const MinMaxAggPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MINMAXAGGPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(mmaggregates);
|
|
|
|
WRITE_NODE_FIELD(quals);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outWindowAggPath(StringInfo str, const WindowAggPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WINDOWAGGPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_NODE_FIELD(winclause);
|
Teach planner and executor about monotonic window funcs
Window functions such as row_number() always return a value higher than
the previously returned value for tuples in any given window partition.
Traditionally, queries such as:
SELECT * FROM (
    SELECT *, row_number() over (order by c) rn
    FROM t
) t WHERE rn <= 10;
were executed fairly inefficiently. Neither the query planner nor the
executor knew that once rn made it to 11, nothing further would match
the outer query's WHERE clause. It would blindly continue until all
tuples were exhausted from the subquery.
Here we implement means to make the above execute more efficiently.
This is done by way of adding a pg_proc.prosupport function to various of
the built-in window functions and adding supporting code to allow the
support function to inform the planner if the window function is
monotonically increasing, monotonically decreasing, both or neither. The
planner is then able to make use of that information and possibly allow
the executor to short-circuit execution by way of adding a "run condition"
to the WindowAgg, letting it determine whether some of its execution work
can be skipped.
This "run condition" is not like a normal filter. These run conditions
are only built using quals comparing values to monotonic window functions.
For monotonically increasing functions, quals making use of the btree
operators for <, <= and = can be used (assuming the window function column
is on the left). Once such a condition becomes false, a monotonically
increasing function can never make it true again. For monotonically
decreasing functions, the >, >= and = btree operators for the given type
can be used for run conditions.
The best-case situation for this is when there is a single WindowAgg node
without a PARTITION BY clause. Here when the run condition becomes false
the WindowAgg node can simply return NULL. No more tuples will ever match
the run condition. It's a little more complex when there is a PARTITION
BY clause. In this case, we cannot return NULL as we must still process
other partitions. To speed this case up we pull tuples from the outer
plan to check if they're from the same partition and simply discard them
if they are. When we find a tuple belonging to another partition we start
processing as normal again until the run condition becomes false or we run
out of tuples to process.
When there are multiple WindowAgg nodes to evaluate, the situation is more
complicated. For intermediate WindowAggs we must ensure we always
return all tuples to the calling node. Any filtering done could lead to
incorrect results in WindowAgg nodes above. For all intermediate nodes,
we can still save some work when the run condition becomes false. We've
no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot
reference their values, and these tuples will not appear in the final
result anyway. The savings here are small in comparison to what can be
saved in the top-level WindowAgg, but still worthwhile.
Intermediate WindowAgg nodes never filter out tuples, but here we change
WindowAgg so that the top-level WindowAgg filters out tuples that don't
match the intermediate WindowAgg node's run condition. Such filters
appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node.
Here we add prosupport functions to allow the above to work for:
row_number(), rank(), dense_rank(), count(*) and count(expr). It appears
technically possible to do the same for min() and max(); however, it seems
unlikely to be useful enough, so that's not done here.
Bump catversion
Author: David Rowley
Reviewed-by: Andy Fan, Zhihong Yu
Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
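An approximate EXPLAIN for the query in the message, showing where the run
condition surfaces (plan shape assumed; details depend on the table):

	/*
	 * EXPLAIN (COSTS OFF)
	 * SELECT * FROM (
	 *     SELECT *, row_number() over (order by c) rn
	 *     FROM t
	 * ) t WHERE rn <= 10;
	 *
	 *  WindowAgg
	 *    Run Condition: (row_number() OVER (?) <= 10)
	 *    ->  Sort
	 *          Sort Key: t_1.c
	 *          ->  Seq Scan on t t_1
	 */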
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
WRITE_BOOL_FIELD(topwindow);
|
2016-03-07 21:58:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outSetOpPath(StringInfo str, const SetOpPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SETOPPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_ENUM_FIELD(cmd, SetOpCmd);
|
|
|
|
WRITE_ENUM_FIELD(strategy, SetOpStrategy);
|
|
|
|
WRITE_NODE_FIELD(distinctList);
|
|
|
|
WRITE_INT_FIELD(flagColIdx);
|
|
|
|
WRITE_INT_FIELD(firstFlag);
|
|
|
|
WRITE_FLOAT_FIELD(numGroups, "%.0f");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outRecursiveUnionPath(StringInfo str, const RecursiveUnionPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RECURSIVEUNIONPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(leftpath);
|
|
|
|
WRITE_NODE_FIELD(rightpath);
|
|
|
|
WRITE_NODE_FIELD(distinctList);
|
|
|
|
WRITE_INT_FIELD(wtParam);
|
|
|
|
WRITE_FLOAT_FIELD(numGroups, "%.0f");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outLockRowsPath(StringInfo str, const LockRowsPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("LOCKROWSPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
|
|
|
WRITE_INT_FIELD(epqParam);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outModifyTablePath(StringInfo str, const ModifyTablePath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MODIFYTABLEPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
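A hedged sketch of the new FDW convention mentioned above; the makeVar
arguments follow the usual pattern for a "ctid" junk column, but check the
current headers before relying on this:

	/*
	 * In AddForeignUpdateTargets(), instead of editing the parse tree,
	 * register the row-identity column through the planner:
	 */
	Var		   *var;

	var = makeVar(rtindex,
				  SelfItemPointerAttributeNumber,
				  TIDOID, -1, InvalidOid, 0);
	add_row_identity_var(root, var, rtindex, "ctid");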
|
|
|
WRITE_NODE_FIELD(subpath);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_ENUM_FIELD(operation, CmdType);
|
|
|
|
WRITE_BOOL_FIELD(canSetTag);
|
|
|
|
WRITE_UINT_FIELD(nominalRelation);
|
2018-10-07 20:33:17 +02:00
|
|
|
WRITE_UINT_FIELD(rootRelation);
|
Allow UPDATE to move rows between partitions.
When an UPDATE causes a row to no longer match the partition
constraint, try to move it to a different partition where it does
match the partition constraint. In essence, the UPDATE is split into
a DELETE from the old partition and an INSERT into the new one. This
can lead to surprising behavior in concurrency scenarios because
EvalPlanQual rechecks won't work as they formerly did; the known
problems are documented. (There is a pending patch to improve the
situation further, but it needs more review.)
Amit Khandekar, reviewed and tested by Amit Langote, David Rowley,
Rajkumar Raghuwanshi, Dilip Kumar, Amul Sul, Thomas Munro, Álvaro
Herrera, Amit Kapila, and me. A few final revisions by me.
Discussion: http://postgr.es/m/CAJ3gD9do9o2ccQ7j7+tSgiE1REY65XRiMb=yJO3u3QhyP8EEPQ@mail.gmail.com
2018-01-19 21:33:06 +01:00
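A small example of the behavior change, assuming a range-partitioned table
t with partitions t1 (a < 10) and t2 (a >= 10):

	/*
	 * UPDATE t SET a = 15 WHERE a = 5;
	 *
	 * Before this commit the UPDATE failed because the new row violates
	 * t1's partition constraint; now the row is deleted from t1 and
	 * inserted into t2.
	 */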
|
|
|
WRITE_BOOL_FIELD(partColsUpdated);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_NODE_FIELD(resultRelations);
|
2021-03-31 17:52:34 +02:00
|
|
|
WRITE_NODE_FIELD(updateColnosLists);
|
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_NODE_FIELD(withCheckOptionLists);
|
|
|
|
WRITE_NODE_FIELD(returningLists);
|
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
|
|
|
WRITE_NODE_FIELD(onconflict);
|
|
|
|
WRITE_INT_FIELD(epqParam);
|
2022-03-28 16:45:58 +02:00
|
|
|
WRITE_NODE_FIELD(mergeActionLists);
|
2016-03-07 21:58:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outLimitPath(StringInfo str, const LimitPath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("LIMITPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_NODE_FIELD(limitOffset);
|
|
|
|
WRITE_NODE_FIELD(limitCount);
|
2020-05-26 01:23:48 +02:00
|
|
|
WRITE_ENUM_FIELD(limitOption, LimitOption);
|
2016-03-07 21:58:22 +01:00
|
|
|
}
|
|
|
|
|
2017-03-09 13:40:36 +01:00
|
|
|
static void
|
|
|
|
_outGatherMergePath(StringInfo str, const GatherMergePath *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GATHERMERGEPATH");
|
|
|
|
|
|
|
|
_outPathInfo(str, (const Path *) node);
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(subpath);
|
|
|
|
WRITE_INT_FIELD(num_workers);
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outNestPath(StringInfo str, const NestPath *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("NESTPATH");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPathInfo(str, (const JoinPath *) node);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outMergePath(StringInfo str, const MergePath *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("MERGEPATH");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPathInfo(str, (const JoinPath *) node);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(path_mergeclauses);
|
|
|
|
WRITE_NODE_FIELD(outersortkeys);
|
|
|
|
WRITE_NODE_FIELD(innersortkeys);
|
2017-04-08 04:20:03 +02:00
|
|
|
WRITE_BOOL_FIELD(skip_mark_restore);
|
2009-11-15 03:45:35 +01:00
|
|
|
WRITE_BOOL_FIELD(materialize_inner);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outHashPath(StringInfo str, const HashPath *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("HASHPATH");
|
|
|
|
|
2011-12-07 20:46:56 +01:00
|
|
|
_outJoinPathInfo(str, (const JoinPath *) node);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(path_hashclauses);
|
2009-03-26 18:15:35 +01:00
|
|
|
WRITE_INT_FIELD(num_batches);
|
2018-06-01 01:07:13 +02:00
|
|
|
WRITE_FLOAT_FIELD(inner_rows_total, "%.0f");
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlannerGlobal(StringInfo str, const PlannerGlobal *node)
|
2007-02-19 08:03:34 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLANNERGLOBAL");
|
|
|
|
|
|
|
|
/* NB: this isn't a complete set of fields */
|
2007-02-22 23:00:26 +01:00
|
|
|
WRITE_NODE_FIELD(subplans);
|
2007-02-27 02:11:26 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(rewindPlanIDs);
|
2007-02-22 23:00:26 +01:00
|
|
|
WRITE_NODE_FIELD(finalrtable);
|
2009-10-12 20:10:51 +02:00
|
|
|
WRITE_NODE_FIELD(finalrowmarks);
|
2011-02-26 00:56:23 +01:00
|
|
|
WRITE_NODE_FIELD(resultRelations);
|
Further adjust EXPLAIN's choices of table alias names.
This patch causes EXPLAIN to always assign a separate table alias to the
parent RTE of an append relation (inheritance set); before, such RTEs
were ignored if not actually scanned by the plan. Since the child RTEs
now always have that same alias to start with (cf. commit 55a1954da),
the net effect is that the parent RTE usually gets the alias used or
implied by the query text, and the children all get that alias with "_N"
appended. (The exception to "usually" is if there are duplicate aliases
in different subtrees of the original query; then some of those original
RTEs will also have "_N" appended.)
This results in more uniform output for partitioned-table plans than
we had before: the partitioned table itself gets the original alias,
and all child tables have aliases with "_N", rather than the previous
behavior where one of the children would get an alias without "_N".
The reason for giving the parent RTE an alias, even if it isn't scanned
by the plan, is that we now use the parent's alias to qualify Vars that
refer to an appendrel output column and appear above the Append or
MergeAppend that computes the appendrel. But below the append, Vars
refer to some one of the child relations, and are displayed that way.
This seems clearer than the old behavior where a Var that could carry
values from any child relation was displayed as if it referred to only
one of them.
While at it, change ruleutils.c so that the code paths used by EXPLAIN
deal in Plan trees not PlanState trees. This effectively reverts a
decision made in commit 1cc29fe7c, which seemed like a good idea at
the time to make ruleutils.c consistent with explain.c. However,
it's problematic because we'd really like to allow executor startup
pruning to remove all the children of an append node when possible,
leaving no child PlanState to resolve Vars against. (That's not done
here, but will be in the next patch.) This requires different handling
of subplans and initplans than before, but is otherwise a pretty
straightforward change.
Discussion: https://postgr.es/m/001001d4f44b$2a2cca50$7e865ef0$@lab.ntt.co.jp
2019-12-11 23:05:18 +01:00
|
|
|
WRITE_NODE_FIELD(appendRelations);
|
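
/*
 * Per the commit message above, a hypothetical "SELECT * FROM parted p"
 * over a partitioned table now shows the parent RTE (and appendrel Vars
 * above the Append) qualified as "p", with the child scans aliased as
 * "p_1", "p_2", and so on in EXPLAIN output.
 */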
2007-10-11 20:05:27 +02:00
|
|
|
WRITE_NODE_FIELD(relationOids);
|
2008-09-09 20:58:09 +02:00
|
|
|
WRITE_NODE_FIELD(invalItems);
|
2017-11-13 21:24:12 +01:00
|
|
|
WRITE_NODE_FIELD(paramExecTypes);
|
2008-10-21 22:42:53 +02:00
|
|
|
WRITE_UINT_FIELD(lastPHId);
|
2011-02-10 05:27:07 +01:00
|
|
|
WRITE_UINT_FIELD(lastRowMarkId);
|
Avoid invalidating all foreign-join cached plans when user mappings change.
We must not push down a foreign join when the foreign tables involved
should be accessed under different user mappings. Previously we tried
to enforce that rule literally during planning, but that meant that the
resulting plans were dependent on the current contents of the
pg_user_mapping catalog, and we had to blow away all cached plans
containing any remote join when anything at all changed in pg_user_mapping.
This could have been improved somewhat, but the fact that a syscache inval
callback has very limited info about what changed made it hard to do better
within that design. Instead, let's change the planner to not consider user
mappings per se, but to allow a foreign join if both RTEs have the same
checkAsUser value. If they do, then they necessarily will use the same
user mapping at runtime, and we don't need to know specifically which one
that is. Post-plan-time changes in pg_user_mapping no longer require any
plan invalidation.
This rule does give up some optimization ability, to wit where two foreign
table references come from views with different owners or one's from a view
and one's directly in the query, but nonetheless the same user mapping
would have applied. We'll sacrifice the first case, but to not regress
more than we have to in the second case, allow a foreign join involving
both zero and nonzero checkAsUser values if the nonzero one is the same as
the prevailing effective userID. In that case, mark the plan as only
runnable by that userID.
The plancache code already had a notion of plans being userID-specific,
in order to support RLS. It was a little confused though, in particular
lacking clarity of thought as to whether it was the rewritten query or just
the finished plan that's dependent on the userID. Rearrange that code so
that it's clearer what depends on which, and so that the same logic applies
to both RLS-injected role dependency and foreign-join-injected role
dependency.
Note that this patch doesn't remove the other issue mentioned in the
original complaint, which is that while we'll reliably stop using a foreign
join if it's disallowed in a new context, we might fail to start using a
foreign join if it's now allowed, but we previously created a generic
cached plan that didn't use one. It was agreed that the chance of winning
that way was not high enough to justify the much larger number of plan
invalidations that would have to occur if we tried to cause it to happen.
In passing, clean up randomly-varying spelling of EXPLAIN commands in
postgres_fdw.sql, and fix a COSTS ON example that had been allowed to
leak into the committed tests.
This reverts most of commits fbe5a3fb7 and 5d4171d1c, which were the
previous attempt at ensuring we wouldn't push down foreign joins that
span permissions contexts.
Etsuro Fujita and Tom Lane
Discussion: <d49c1e5b-f059-20f4-c132-e9752ee0113e@lab.ntt.co.jp>
2016-07-15 23:22:56 +02:00
|
|
|
WRITE_INT_FIELD(lastPlanNodeId);
|
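
/*
 * A minimal sketch of the checkAsUser rule described in the commit message
 * above.  This helper is hypothetical (the real logic lives in the
 * planner's joinrel-building code), but it shows the idea: a foreign join
 * is only considered when both sides resolve to the same user, treating
 * InvalidOid as "the current effective user".
 */
static bool
join_userids_compatible(Oid outer_user, Oid inner_user, Oid *join_user)
{
	if (outer_user == inner_user)
		*join_user = outer_user;
	else if (outer_user == InvalidOid && inner_user == GetUserId())
		*join_user = inner_user;
	else if (inner_user == InvalidOid && outer_user == GetUserId())
		*join_user = outer_user;
	else
		return false;			/* different permissions contexts */
	return true;
}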
2008-10-21 22:42:53 +02:00
|
|
|
WRITE_BOOL_FIELD(transientPlan);
|
2016-07-15 23:22:56 +02:00
|
|
|
WRITE_BOOL_FIELD(dependsOnRole);
|
2015-09-16 21:38:47 +02:00
|
|
|
WRITE_BOOL_FIELD(parallelModeOK);
|
|
|
|
WRITE_BOOL_FIELD(parallelModeNeeded);
|
2016-08-19 20:03:07 +02:00
|
|
|
WRITE_CHAR_FIELD(maxParallelHazard);
|
2007-02-19 08:03:34 +01:00
|
|
|
}
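
/*
 * For reference, a sketch of what the field macros above expand to, per
 * the Out-macro conventions at the top of this file:
 * WRITE_UINT_FIELD(lastPHId) becomes roughly
 *
 *     appendStringInfo(str, " :lastPHId %u", node->lastPHId);
 *
 * while WRITE_BOOL_FIELD prints "true"/"false" and WRITE_CHAR_FIELD
 * routes through outChar to emit the character as a safely quoted token.
 */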
|
|
|
|
|
2005-06-06 00:32:58 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlannerInfo(StringInfo str, const PlannerInfo *node)
|
2005-06-06 00:32:58 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLANNERINFO");
|
|
|
|
|
2005-06-06 06:13:36 +02:00
|
|
|
/* NB: this isn't a complete set of fields */
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(parse);
|
2007-02-19 08:03:34 +01:00
|
|
|
WRITE_NODE_FIELD(glob);
|
|
|
|
WRITE_UINT_FIELD(query_level);
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evaluation mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
WRITE_NODE_FIELD(plan_params);
|
2015-08-12 05:48:37 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(outer_params);
|
2012-01-28 01:26:38 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(all_baserels);
|
Compute correct em_nullable_relids in get_eclass_for_sort_expr().
Bug #8591 from Claudio Freire demonstrates that get_eclass_for_sort_expr
must be able to compute valid em_nullable_relids for any new equivalence
class members it creates. I'd worried about this in the commit message
for db9f0e1d9a4a0842c814a464cdc9758c3f20b96c, but claimed that it wasn't a
problem because multi-member ECs should already exist when it runs. That
is transparently wrong, though, because this function is also called by
initialize_mergeclause_eclasses, which runs during deconstruct_jointree.
The example given in the bug report (which the new regression test item
is based upon) fails because the COALESCE() expression is first seen by
initialize_mergeclause_eclasses rather than process_equivalence.
Fixing this requires passing the appropriate nullable_relids set to
get_eclass_for_sort_expr, and it requires new code to compute that set
for top-level expressions such as ORDER BY, GROUP BY, etc. We store
the top-level nullable_relids in a new field in PlannerInfo to avoid
computing it many times. In the back branches, I've added the new
field at the end of the struct to minimize ABI breakage for planner
plugins. There doesn't seem to be a good alternative to changing
get_eclass_for_sort_expr's API signature, though. There probably aren't
any third-party extensions calling that function directly; moreover,
if there are, they probably need to think about what to pass for
nullable_relids anyway.
Back-patch to 9.2, like the previous patch in this area.
2013-11-15 22:46:18 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(nullable_baserels);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(join_rel_list);
|
2009-11-28 01:46:19 +01:00
|
|
|
WRITE_INT_FIELD(join_cur_level);
|
2007-02-19 08:03:34 +01:00
|
|
|
WRITE_NODE_FIELD(init_plans);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(cte_plan_ids);
|
Implement UPDATE tab SET (col1,col2,...) = (SELECT ...), ...
This SQL-standard feature allows a sub-SELECT yielding multiple columns
(but only one row) to be used to compute the new values of several columns
to be updated. While the same results can be had with an independent
sub-SELECT per column, such a workaround can require a great deal of
duplicated computation.
The standard actually says that the source for a multi-column assignment
could be any row-valued expression. The implementation used here is
tightly tied to our existing sub-SELECT support and can't handle other
cases; the Bison grammar would have some issues with them too. However,
I don't feel too bad about this since other cases can be converted into
sub-SELECTs. For instance, "SET (a,b,c) = row_valued_function(x)" could
be written "SET (a,b,c) = (SELECT * FROM row_valued_function(x))".
2014-06-18 19:22:25 +02:00
|
|
|
WRITE_NODE_FIELD(multiexpr_params);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_NODE_FIELD(eq_classes);
|
Speed up finding EquivalenceClasses for a given set of rels
Previously in order to determine which ECs a relation had members in, we
had to loop over all ECs stored in PlannerInfo's eq_classes and check if
ec_relids mentioned the relation. For the most part, this was fine, as
generally, unless queries were fairly complex, the overhead of performing
the lookup would have not been that significant. However, when queries
contained large numbers of joins and ECs, the overhead to find the set of
classes matching a given set of relations could become a significant
portion of the overall planning effort.
Here we allow a much more efficient method to access the ECs which match a
given relation or set of relations. A new Bitmapset field in RelOptInfo
now exists to store the indexes into PlannerInfo's eq_classes list which
each relation is mentioned in. This allows very fast lookups to find all
ECs belonging to a single relation. When we need to lookup ECs belonging
to a given pair of relations, we can simply bitwise-AND the Bitmapsets from
each relation and use the result to perform the lookup.
We also take the opportunity to write a new implementation of
generate_join_implied_equalities which makes use of the new indexes.
generate_join_implied_equalities_for_ecs must remain as is as it can be
given a custom list of ECs, which we can't easily determine the indexes of.
This was originally intended to fix the performance penalty of looking up
foreign keys matching a join condition which was introduced by 100340e2d.
However, we're speeding up much more than just that here.
Author: David Rowley, Tom Lane
Reviewed-by: Tom Lane, Tomas Vondra
Discussion: https://postgr.es/m/6970.1545327857@sss.pgh.pa.us
2019-07-21 07:30:58 +02:00
|
|
|
WRITE_BOOL_FIELD(ec_merging_done);
|
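
/*
 * Illustrative sketch (hypothetical helper, not part of outfuncs.c) of
 * the eclass_indexes lookup idea from the commit message above: the
 * EquivalenceClasses common to two relations are found by intersecting
 * their index bitmapsets and probing root->eq_classes.
 */
static void
visit_common_eclasses(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2)
{
	Bitmapset  *common = bms_intersect(rel1->eclass_indexes,
									   rel2->eclass_indexes);
	int			i = -1;

	while ((i = bms_next_member(common, i)) >= 0)
	{
		EquivalenceClass *ec = (EquivalenceClass *)
			list_nth(root->eq_classes, i);

		/* ... consider ec when generating join clauses ... */
		(void) ec;
	}
	bms_free(common);
}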
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_NODE_FIELD(canon_pathkeys);
|
Teach planner about some cases where a restriction clause can be
propagated inside an outer join. In particular, given
LEFT JOIN ON (A = B) WHERE A = constant, we cannot conclude that
B = constant at the top level (B might be null instead), but we
can nonetheless put a restriction B = constant into the quals for
B's relation, since no inner-side rows not meeting that condition
can contribute to the final result. Similarly, given
FULL JOIN USING (J) WHERE J = constant, we can't directly conclude
that either input J variable = constant, but it's OK to push such
quals into each input rel. Per recent gripe from Kim Bisgaard.
Along the way, remove 'valid_everywhere' flag from RestrictInfo,
as on closer analysis it was not being used for anything, and was
defined backwards anyway.
2005-07-03 01:00:42 +02:00
|
|
|
WRITE_NODE_FIELD(left_join_clauses);
|
|
|
|
WRITE_NODE_FIELD(right_join_clauses);
|
|
|
|
WRITE_NODE_FIELD(full_join_clauses);
|
2008-08-14 20:48:00 +02:00
|
|
|
WRITE_NODE_FIELD(join_info_list);
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(all_result_relids);
|
|
|
|
WRITE_BITMAPSET_FIELD(leaf_result_relids);
|
2006-01-31 22:39:25 +01:00
|
|
|
WRITE_NODE_FIELD(append_rel_list);
|
2021-03-31 17:52:34 +02:00
|
|
|
WRITE_NODE_FIELD(row_identity_vars);
|
Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
2008-10-21 22:42:53 +02:00
|
|
|
WRITE_NODE_FIELD(placeholder_list);
|
2016-06-18 21:22:34 +02:00
|
|
|
WRITE_NODE_FIELD(fkey_list);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(query_pathkeys);
|
2005-08-28 00:13:44 +02:00
|
|
|
WRITE_NODE_FIELD(group_pathkeys);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_NODE_FIELD(window_pathkeys);
|
2008-08-05 04:43:18 +02:00
|
|
|
WRITE_NODE_FIELD(distinct_pathkeys);
|
2005-08-28 00:13:44 +02:00
|
|
|
WRITE_NODE_FIELD(sort_pathkeys);
|
Make the upper part of the planner work by generating and comparing Paths.
I've been saying we needed to do this for more than five years, and here it
finally is. This patch removes the ever-growing tangle of spaghetti logic
that grouping_planner() used to use to try to identify the best plan for
post-scan/join query steps. Now, there is (nearly) independent
consideration of each execution step, and entirely separate construction of
Paths to represent each of the possible ways to do that step. We choose
the best Path or set of Paths using the same add_path() logic that's been
used inside query_planner() for years.
In addition, this patch removes the old restriction that subquery_planner()
could return only a single Plan. It now returns a RelOptInfo containing a
set of Paths, just as query_planner() does, and the parent query level can
use each of those Paths as the basis of a SubqueryScanPath at its level.
This allows finding some optimizations that we missed before, wherein a
subquery was capable of returning presorted data and thereby avoiding a
sort in the parent level, making the overall cost cheaper even though
delivering sorted output was not the cheapest plan for the subquery in
isolation. (A couple of regression test outputs change in consequence of
that. However, there is very little change in visible planner behavior
overall, because the point of this patch is not to get immediate planning
benefits but to create the infrastructure for future improvements.)
There is a great deal left to do here. This patch unblocks a lot of
planner work that was basically impractical in the old code structure,
such as allowing FDWs to implement remote aggregation, or rewriting
plan_set_operations() to allow consideration of multiple implementation
orders for set operations. (The latter will likely require a full
rewrite of plan_set_operations(); what I've done here is only to fix it
to return Paths not Plans.) I have also left unfinished some localized
refactoring in createplan.c and planner.c, because it was not necessary
to get this patch to a working state.
Thanks to Robert Haas, David Rowley, and Amit Kapila for review.
2016-03-07 21:58:22 +01:00
|
|
|
WRITE_NODE_FIELD(processed_tlist);
|
2021-03-31 17:52:34 +02:00
|
|
|
WRITE_NODE_FIELD(update_colnos);
|
2010-11-04 17:01:17 +01:00
|
|
|
WRITE_NODE_FIELD(minmax_aggs);
|
2006-09-20 00:49:53 +02:00
|
|
|
WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
|
2005-08-28 00:13:44 +02:00
|
|
|
WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
|
2010-11-18 06:30:10 +01:00
|
|
|
WRITE_FLOAT_FIELD(limit_tuples, "%.0f");
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
WRITE_UINT_FIELD(qual_security_level);
|
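
/*
 * A minimal sketch of the evaluation-order rule described above
 * (hypothetical helper; the real checks live in the qual-ordering code):
 * qual a may run before qual b only if it is at the same or lower
 * security level, or is leakproof.
 */
static bool
qual_may_precede(const RestrictInfo *a, const RestrictInfo *b)
{
	return a->security_level <= b->security_level || a->leakproof;
}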
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_BOOL_FIELD(hasJoinRTEs);
|
2012-08-27 04:48:55 +02:00
|
|
|
WRITE_BOOL_FIELD(hasLateralRTEs);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_BOOL_FIELD(hasHavingQual);
|
Revise the planner's handling of "pseudoconstant" WHERE clauses, that is
clauses containing no variables and no volatile functions. Such a clause
can be used as a one-time qual in a gating Result plan node, to suppress
plan execution entirely when it is false. Even when the clause is true,
putting it in a gating node wins by avoiding repeated evaluation of the
clause. In previous PG releases, query_planner() would do this for
pseudoconstant clauses appearing at the top level of the jointree, but
there was no ability to generate a gating Result deeper in the plan tree.
To fix it, get rid of the special case in query_planner(), and instead
process pseudoconstant clauses through the normal RestrictInfo qual
distribution mechanism. When a pseudoconstant clause is found attached to
a path node in create_plan(), pull it out and generate a gating Result at
that point. This requires special-casing pseudoconstants in selectivity
estimation and cost_qual_eval, but on the whole it's pretty clean.
It probably even makes the planner a bit faster than before for the normal
case of no pseudoconstants, since removing pull_constant_clauses saves one
useless traversal of the qual tree. Per gripe from Phil Frost.
2006-07-01 20:38:33 +02:00
|
|
|
WRITE_BOOL_FIELD(hasPseudoConstantQuals);
|
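
/*
 * Example of the pseudoconstant-qual behavior described above: in a
 * hypothetical "SELECT ... FROM t WHERE $1 > 0", the qual contains no
 * Vars and no volatile functions, so it becomes a one-time filter in a
 * gating Result node instead of being re-tested for every row.
 */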
Move resolution of AlternativeSubPlan choices to the planner.
When commit bd3daddaf introduced AlternativeSubPlans, I had some
ambitions towards allowing the choice of subplan to change during
execution. That has not happened, or even been thought about, in the
ensuing twelve years; so it seems like a failed experiment. So let's
rip that out and resolve the choice of subplan at the end of planning
(in setrefs.c) rather than during executor startup. This has a number
of positive benefits:
* Removal of a few hundred lines of executor code, since
AlternativeSubPlans need no longer be supported there.
* Removal of executor-startup overhead (particularly, initialization
of subplans that won't be used).
* Removal of incidental costs of having a larger plan tree, such as
tree-scanning and copying costs in the plancache; not to mention
setrefs.c's own costs of processing the discarded subplans.
* EXPLAIN no longer has to print a weird (and undocumented)
representation of an AlternativeSubPlan choice; it sees only the
subplan actually used. This should mean less confusion for users.
* Since setrefs.c knows which subexpression of a plan node it's
working on at any instant, it's possible to adjust the estimated
number of executions of the subplan based on that. For example,
we should usually estimate more executions of a qual expression
than a targetlist expression. The implementation used here is
pretty simplistic, because we don't want to expend a lot of cycles
on the issue; but it's better than ignoring the point entirely,
as the executor had to.
That last point might possibly result in shifting the choice
between hashed and non-hashed EXISTS subplans in a few cases,
but in general this patch isn't meant to change planner choices.
Since we're doing the resolution so late, it's really impossible
to change any plan choices outside the AlternativeSubPlan itself.
Patch by me; thanks to David Rowley for review.
Discussion: https://postgr.es/m/1992952.1592785225@sss.pgh.pa.us
2020-09-27 18:51:28 +02:00
|
|
|
WRITE_BOOL_FIELD(hasAlternativeSubPlans);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_BOOL_FIELD(hasRecursion);
|
|
|
|
WRITE_INT_FIELD(wt_param_id);
|
2010-07-12 19:01:06 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(curOuterRels);
|
|
|
|
WRITE_NODE_FIELD(curOuterParams);
|
Faster partition pruning
Add a new module backend/partitioning/partprune.c, implementing a more
sophisticated algorithm for partition pruning. The new module uses each
partition's "boundinfo" for pruning instead of constraint exclusion,
based on an idea proposed by Robert Haas of a "pruning program": a list
of steps generated from the query quals which are run iteratively to
obtain a list of partitions that must be scanned in order to satisfy
those quals.
At present, this targets planner-time partition pruning, but there exist
further patches to apply partition pruning at execution time as well.
This commit also moves some definitions from include/catalog/partition.h
to a new file include/partitioning/partbounds.h, in an attempt to
rationalize partitioning related code.
Authors: Amit Langote, David Rowley, Dilip Kumar
Reviewers: Robert Haas, Kyotaro Horiguchi, Ashutosh Bapat, Jesper Pedersen.
Discussion: https://postgr.es/m/098b9c71-1915-1a2a-8d52-1a7a50ce79e8@lab.ntt.co.jp
2018-04-06 21:23:04 +02:00
|
|
|
WRITE_BOOL_FIELD(partColsUpdated);
|
2005-06-06 00:32:58 +02:00
|
|
|
}
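
/*
 * Usage note: these planner-structure output functions are reached via
 * nodeToString(); e.g. given a PlannerInfo pointer (conventionally called
 * root), one might dump it while debugging with
 *
 *     elog(DEBUG1, "%s", nodeToString(root));
 */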
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRelOptInfo(StringInfo str, const RelOptInfo *node)
|
2005-06-06 00:32:58 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RELOPTINFO");
|
|
|
|
|
|
|
|
/* NB: this isn't a complete set of fields */
|
|
|
|
WRITE_ENUM_FIELD(reloptkind, RelOptKind);
|
|
|
|
WRITE_BITMAPSET_FIELD(relids);
|
|
|
|
WRITE_FLOAT_FIELD(rows, "%.0f");
|
2012-09-02 00:16:24 +02:00
|
|
|
WRITE_BOOL_FIELD(consider_startup);
|
Fix planner's cost estimation for SEMI/ANTI joins with inner indexscans.
When the inner side of a nestloop SEMI or ANTI join is an indexscan that
uses all the join clauses as indexquals, it can be presumed that both
matched and unmatched outer rows will be processed very quickly: for
matched rows, we'll stop after fetching one row from the indexscan, while
for unmatched rows we'll have an indexscan that finds no matching index
entries, which should also be quick. The planner already knew about this,
but it was nonetheless charging for at least one full run of the inner
indexscan, as a consequence of concerns about the behavior of materialized
inner scans --- but those concerns don't apply in the fast case. If the
inner side has low cardinality (many matching rows) this could make an
indexscan plan look far more expensive than it actually is. To fix,
rearrange the work in initial_cost_nestloop/final_cost_nestloop so that we
don't add the inner scan cost until we've inspected the indexquals, and
then we can add either the full-run cost or just the first tuple's cost as
appropriate.
Experimentation with this fix uncovered another problem: add_path and
friends were coded to disregard cheap startup cost when considering
parameterized paths. That's usually okay (and desirable, because it thins
the path herd faster); but in this fast case for SEMI/ANTI joins, it could
result in throwing away the desired plain indexscan path in favor of a
bitmap scan path before we ever get to the join costing logic. In the
many-matching-rows cases of interest here, a bitmap scan will do a lot more
work than required, so this is a problem. To fix, add a per-relation flag
consider_param_startup that works like the existing consider_startup flag,
but applies to parameterized paths, and set it for relations that are the
inside of a SEMI or ANTI join.
To make this patch reasonably safe to back-patch, care has been taken to
avoid changing the planner's behavior except in the very narrow case of
SEMI/ANTI joins with inner indexscans. There are places in
compare_path_costs_fuzzily and add_path_precheck that are not terribly
consistent with the new approach, but changing them will affect planner
decisions at the margins in other cases, so we'll leave that for a
HEAD-only fix.
Back-patch to 9.3; before that, the consider_startup flag didn't exist,
meaning that the second aspect of the patch would be too invasive.
Per a complaint from Peter Holzer and analysis by Tomas Vondra.
2015-06-03 17:58:47 +02:00
|
|
|
WRITE_BOOL_FIELD(consider_param_startup);
|
Generate parallel sequential scan plans in simple cases.
Add a new flag, consider_parallel, to each RelOptInfo, indicating
whether a plan for that relation could conceivably be run inside of
a parallel worker. Right now, we're pretty conservative: for example,
it might be possible to defer applying a parallel-restricted qual
in a worker, and later do it in the leader, but right now we just
don't try to parallelize access to that relation. That's probably
the right decision in most cases, anyway.
Using the new flag, generate parallel sequential scan plans for plain
baserels, meaning that we now have parallel sequential scan in
PostgreSQL. The logic here is pretty unsophisticated right now: the
costing model probably isn't right in detail, and we can't push joins
beneath Gather nodes, so the number of plans that can actually benefit
from this is pretty limited right now. Lots more work is needed.
Nevertheless, it seems time to enable this functionality so that all
this code can actually be tested easily by users and developers.
Note that, if you wish to test this functionality, it will be
necessary to set max_parallel_degree to a value greater than the
default of 0. Once a few more loose ends have been tidied up here, we
might want to consider changing the default value of this GUC, but
I'm leaving it alone for now.
Along the way, fix a bug in cost_gather: the previous coding thought
that a Gather node's transfer overhead should be costed on the basis of
the relation size rather than the number of tuples that actually need
to be passed off to the leader.
Patch by me, reviewed in earlier versions by Amit Kapila.
2015-11-11 15:02:52 +01:00
|
|
|
WRITE_BOOL_FIELD(consider_parallel);
|
2016-03-14 21:59:59 +01:00
|
|
|
WRITE_NODE_FIELD(reltarget);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(pathlist);
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
WRITE_NODE_FIELD(ppilist);
|
2016-01-20 20:29:22 +01:00
|
|
|
WRITE_NODE_FIELD(partial_pathlist);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(cheapest_startup_path);
|
|
|
|
WRITE_NODE_FIELD(cheapest_total_path);
|
|
|
|
WRITE_NODE_FIELD(cheapest_unique_path);
|
2012-01-28 01:26:38 +01:00
|
|
|
WRITE_NODE_FIELD(cheapest_parameterized_paths);
|
2015-12-11 21:52:16 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(direct_lateral_relids);
|
2015-12-08 00:56:14 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(lateral_relids);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_UINT_FIELD(relid);
|
2014-12-11 09:19:50 +01:00
|
|
|
WRITE_OID_FIELD(reltablespace);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_ENUM_FIELD(rtekind, RTEKind);
|
2006-01-31 22:39:25 +01:00
|
|
|
WRITE_INT_FIELD(min_attr);
|
|
|
|
WRITE_INT_FIELD(max_attr);
|
2012-08-27 04:48:55 +02:00
|
|
|
WRITE_NODE_FIELD(lateral_vars);
|
2013-08-18 02:22:37 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(lateral_referencers);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(indexlist);
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
|
|
|
WRITE_NODE_FIELD(statlist);
|
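
/*
 * Illustrative SQL for the extended-statistics objects carried in
 * statlist (statistics object and table names invented):
 *
 *   CREATE STATISTICS s1 (ndistinct) ON a, b FROM t1;
 */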
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_UINT_FIELD(pages);
|
|
|
|
WRITE_FLOAT_FIELD(tuples, "%.0f");
|
2011-10-14 23:23:01 +02:00
|
|
|
WRITE_FLOAT_FIELD(allvisfrac, "%.6f");
|
2019-07-21 07:30:58 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(eclass_indexes);
|
2011-09-03 21:35:12 +02:00
|
|
|
WRITE_NODE_FIELD(subroot);
|
2012-09-05 18:54:03 +02:00
|
|
|
WRITE_NODE_FIELD(subplan_params);
|
2016-07-15 23:22:56 +02:00
|
|
|
WRITE_INT_FIELD(rel_parallel_workers);
|
2021-02-27 10:59:36 +01:00
|
|
|
WRITE_UINT_FIELD(amflags);
|
Code review for foreign/custom join pushdown patch.
Commit e7cb7ee14555cc9c5773e2c102efd6371f6f2005 included some design
decisions that seem pretty questionable to me, and there was quite a lot
of stuff not to like about the documentation and comments. Clean up
as follows:
* Consider foreign joins only between foreign tables on the same server,
rather than between any two foreign tables with the same underlying FDW
handler function. In most if not all cases, the FDW would simply have had
to apply the same-server restriction itself (far more expensively, both for
lack of caching and because it would be repeated for each combination of
input sub-joins), or else risk nasty bugs. Anyone who's really intent on
doing something outside this restriction can always use the
set_join_pathlist_hook.
* Rename fdw_ps_tlist/custom_ps_tlist to fdw_scan_tlist/custom_scan_tlist
to better reflect what they're for, and allow these custom scan tlists
to be used even for base relations.
* Change make_foreignscan() API to include passing the fdw_scan_tlist
value, since the FDW is required to set that. Backwards compatibility
doesn't seem like an adequate reason to expect FDWs to set it in some
ad-hoc extra step, and anyway existing FDWs can just pass NIL.
* Change the API of path-generating subroutines of add_paths_to_joinrel,
and in particular that of GetForeignJoinPaths and set_join_pathlist_hook,
so that various less-used parameters are passed in a struct rather than
as separate parameter-list entries. The objective here is to reduce the
probability that future additions to those parameter lists will result in
source-level API breaks for users of these hooks. It's possible that this
is even a small win for the core code, since most CPU architectures can't
pass more than half a dozen parameters efficiently anyway. I kept root,
joinrel, outerrel, innerrel, and jointype as separate parameters to reduce
code churn in joinpath.c --- in particular, putting jointype into the
struct would have been problematic because of the subroutines' habit of
changing their local copies of that variable.
* Avoid ad-hocery in ExecAssignScanProjectionInfo. It was probably all
right for it to know about IndexOnlyScan, but if the list is to grow
we should refactor the knowledge out to the callers.
* Restore nodeForeignscan.c's previous use of the relcache to avoid
extra GetFdwRoutine lookups for base-relation scans.
* Lots of cleanup of documentation and missed comments. Re-order some
code additions into more logical places.
2015-05-10 20:36:30 +02:00
|
|
|
WRITE_OID_FIELD(serverid);
|
2016-07-15 23:22:56 +02:00
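The resulting compatibility rule can be sketched as a small helper (the function is hypothetical; GetUserId() is the real backend call, and a checkAsUser of zero means "use the current user"):

#include "miscadmin.h"          /* for GetUserId() */

/*
 * Hypothetical helper: do two input rels agree well enough on
 * checkAsUser for their join to be pushed down?  On success, sets
 * *join_user and *user_is_current for the joinrel.
 */
static bool
common_join_userid(Oid outer_user, Oid inner_user,
                   Oid *join_user, bool *user_is_current)
{
    if (outer_user == inner_user)
    {
        *join_user = outer_user;        /* same checkAsUser: always OK */
        *user_is_current = false;
        return true;
    }
    /* allow zero vs. nonzero if the nonzero one is the current user */
    if ((!OidIsValid(outer_user) && inner_user == GetUserId()) ||
        (!OidIsValid(inner_user) && outer_user == GetUserId()))
    {
        *join_user = OidIsValid(outer_user) ? outer_user : inner_user;
        *user_is_current = true;        /* plan only runnable by this user */
        return true;
    }
    return false;                       /* different mappings: no pushdown */
}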
|
|
|
WRITE_OID_FIELD(userid);
|
|
|
|
WRITE_BOOL_FIELD(useridiscurrent);
|
Revise FDW planning API, again.
Further reflection shows that a single callback isn't very workable if we
desire to let FDWs generate multiple Paths, because that forces the FDW to
do all work necessary to generate a valid Plan node for each Path. Instead
split the former PlanForeignScan API into three steps: GetForeignRelSize,
GetForeignPaths, GetForeignPlan. We had already bit the bullet of breaking
the 9.1 FDW API for 9.2, so this shouldn't cause very much additional pain,
and it's substantially more flexible for complex FDWs.
Add an fdw_private field to RelOptInfo so that the new functions can save
state there rather than possibly having to recalculate information two or
three times.
In addition, we'd not thought through what would be needed to allow an FDW
to set up subexpressions of its choice for runtime execution. We could
treat ForeignScan.fdw_private as an executable expression but that seems
likely to break existing FDWs unnecessarily (in particular, it would
restrict the set of node types allowable in fdw_private to those supported
by expression_tree_walker). Instead, invent a separate field fdw_exprs
which will receive the postprocessing appropriate for expression trees.
(One field is enough since it can be a list of expressions; also, we assume
the corresponding expression state tree(s) will be held within fdw_state,
so we don't need to add anything to ForeignScanState.)
Per review of Hanada Shigeru's pgsql_fdw patch. We may need to tweak this
further as we continue to work on that patch, but to me it feels a lot
closer to being right now.
2012-03-09 18:48:48 +01:00
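A skeletal handler wiring up the three steps might look like this (a sketch only: the my* callbacks are assumed to be defined elsewhere in the module, but the FdwRoutine fields are the real API):

#include "fmgr.h"
#include "foreign/fdwapi.h"

PG_FUNCTION_INFO_V1(my_fdw_handler);    /* hypothetical FDW */

Datum
my_fdw_handler(PG_FUNCTION_ARGS)
{
    FdwRoutine *routine = makeNode(FdwRoutine);

    /* step 1: estimate rel size, stash state in baserel->fdw_private */
    routine->GetForeignRelSize = myGetForeignRelSize;
    /* step 2: generate one or more ForeignPaths via add_path() */
    routine->GetForeignPaths = myGetForeignPaths;
    /* step 3: turn the chosen path into a ForeignScan plan node,
     * placing runtime-executable expressions into fdw_exprs */
    routine->GetForeignPlan = myGetForeignPlan;

    PG_RETURN_POINTER(routine);
}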
|
|
|
/* we don't try to print fdwroutine or fdw_private */
|
2017-04-08 04:20:03 +02:00
|
|
|
/* can't print unique_for_rels/non_unique_for_rels; BMSes aren't Nodes */
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(baserestrictinfo);
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
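A minimal sketch of the resulting ordering rule for one scan node's qual list (illustrative only; the planner's actual logic lives in order_qual_clauses()). Treating leakproof quals as security level 0 is what lets them sort ahead of more-secure quals:

#include <stdlib.h>

/* illustrative sort item: a qual with its cost and security level */
typedef struct QualSortItem
{
    void       *clause;
    double      per_tuple_cost;
    unsigned    security_level; /* 0 for leakproof quals */
} QualSortItem;

static int
qual_sort_cmp(const void *a, const void *b)
{
    const QualSortItem *qa = (const QualSortItem *) a;
    const QualSortItem *qb = (const QualSortItem *) b;

    /* lower security level always evaluates first ... */
    if (qa->security_level != qb->security_level)
        return (qa->security_level < qb->security_level) ? -1 : 1;
    /* ... and within one level, cheaper quals go first */
    if (qa->per_tuple_cost != qb->per_tuple_cost)
        return (qa->per_tuple_cost < qb->per_tuple_cost) ? -1 : 1;
    return 0;
}

/* usage: qsort(items, nitems, sizeof(QualSortItem), qual_sort_cmp); */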
|
|
|
WRITE_UINT_FIELD(baserestrict_min_security);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(joininfo);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_BOOL_FIELD(has_eclass_joins);
|
Disable support for partitionwise joins in problematic cases.
Commit f49842d, which added support for partitionwise joins, built the
child's tlist by applying adjust_appendrel_attrs() to the parent's. So in
the case where the parent's included a whole-row Var for the parent, the
child's contained a ConvertRowtypeExpr. To cope with that, that commit
added code to the planner, such as setrefs.c, but some code paths still
assumed that the tlist for a scan (or join) rel would only include Vars
and PlaceHolderVars, which was true before that commit, causing errors:
* When creating an explicit sort node for an input path for a mergejoin
path for a child join, prepare_sort_from_pathkeys() threw the 'could not
find pathkey item to sort' error.
* When deparsing a relation participating in a pushed down child join as a
subquery in contrib/postgres_fdw, get_relation_column_alias_ids() threw
the 'unexpected expression in subquery output' error.
* When performing set_plan_references() on a local join plan generated by
contrib/postgres_fdw for EvalPlanQual support for a pushed down child
join, fix_join_expr() threw the 'variable not found in subplan target
lists' error.
To fix these, two approaches have been proposed: one by Ashutosh Bapat and
one by me. While the former keeps building the child's tlist with a
ConvertRowtypeExpr, the latter builds it with a whole-row Var for the
child not to violate the planner assumption, and tries to fix it up later.
But both approaches need more work, so refuse to generate partitionwise
join paths when whole-row Vars are involved, instead. We don't need to
handle ConvertRowtypeExprs in the child's tlists for now, so this commit
also removes the changes to the planner.
Previously, partitionwise join computed attr_needed data for each child
separately, and built the child join's tlist using that data, which also
required an extra step for adding PlaceHolderVars to that tlist, but it
would be more efficient to build it from the parent join's tlist through
the adjust_appendrel_attrs() transformation. So this commit builds that
list that way, and simplifies build_joinrel_tlist() and placeholder.c as
well as part of set_append_rel_size() to basically what they were before
partitionwise join went in.
Back-patch to PG11 where partitionwise join was introduced.
Report by Rajkumar Raghuwanshi. Analysis by Ashutosh Bapat, who also
provided some of regression tests. Patch by me, reviewed by Robert Haas.
Discussion: https://postgr.es/m/CAKcux6ktu-8tefLWtQuuZBYFaZA83vUzuRd7c1YHC-yEWyYFpg@mail.gmail.com
2018-08-31 13:34:06 +02:00
|
|
|
WRITE_BOOL_FIELD(consider_partitionwise_join);
|
2017-04-04 05:06:36 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(top_parent_relids);
|
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise
join when input partitioned tables had exactly the same partition
bounds. This commit extends the technique to some cases when the tables
have different partition bounds, by using an advanced partition-matching
algorithm introduced by this commit. For both the input partitioned
tables, the algorithm checks whether every partition of one input
partitioned table only matches one partition of the other input
partitioned table at most, and vice versa. In such a case the join
between the tables can be broken down into joins between the matching
partitions, so the algorithm produces the pairs of the matching
partitions, plus the partition bounds for the join relation, to allow
partitionwise join for computing the join. Currently, the algorithm
works for list-partitioned and range-partitioned tables, but not
hash-partitioned tables. See comments in partition_bounds_merge().
Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar
Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by
Dmitry Dolgov and Amul Sul, with additional review at various points by
Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote,
Justin Pryzby, and Tomas Vondra
Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
2020-04-08 03:25:00 +02:00
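The core matching condition can be illustrated with a toy version over sorted, disjoint integer ranges; the real implementation in partition_bounds_merge() works on PartitionBoundInfo and also handles list bounds, defaults, and NULL partitions:

#include <stdbool.h>

/* illustrative range type; real code works on PartitionBoundInfo */
typedef struct PartRange
{
    int         lo;             /* inclusive lower bound */
    int         hi;             /* exclusive upper bound */
} PartRange;

/*
 * Check the one-to-one property over two sorted lists of disjoint
 * ranges: no range on either side may overlap two ranges on the other.
 */
static bool
ranges_match_one_to_one(const PartRange *a, int na,
                        const PartRange *b, int nb)
{
    int         i = 0,
                j = 0;

    while (i < na && j < nb)
    {
        if (a[i].hi <= b[j].lo)
            i++;                /* a[i] ends before b[j] begins */
        else if (b[j].hi <= a[i].lo)
            j++;                /* b[j] ends before a[i] begins */
        else
        {
            /* overlap: it must not spill into the partner's neighbor */
            if ((j + 1 < nb && b[j + 1].lo < a[i].hi) ||
                (i + 1 < na && a[i + 1].lo < b[j].hi))
                return false;   /* one partition would match two */
            i++;
            j++;
        }
    }
    return true;                /* leftovers simply have no partner */
}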
|
|
|
WRITE_BOOL_FIELD(partbounds_merged);
|
2021-08-03 01:47:24 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(live_parts);
|
2020-04-08 03:25:00 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(all_partrels);
|
2005-06-06 00:32:58 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outIndexOptInfo(StringInfo str, const IndexOptInfo *node)
|
2005-06-06 00:32:58 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("INDEXOPTINFO");
|
|
|
|
|
|
|
|
/* NB: this isn't a complete set of fields */
|
|
|
|
WRITE_OID_FIELD(indexoid);
|
|
|
|
/* Do NOT print rel field, else infinite recursion */
|
|
|
|
WRITE_UINT_FIELD(pages);
|
|
|
|
WRITE_FLOAT_FIELD(tuples, "%.0f");
|
Redesign the planner's handling of index-descent cost estimation.
Historically we've used a couple of very ad-hoc fudge factors to try to
get the right results when indexes of different sizes would satisfy a
query with the same number of index leaf tuples being visited. In
commit 21a39de5809cd3050a37d2554323cc1d0cbeed9d I tweaked one of these
fudge factors, with results that proved disastrous for larger indexes.
Commit bf01e34b556ff37982ba2d882db424aa484c0d07 fudged it some more,
but still with not a lot of principle behind it.
What seems like a better way to address these issues is to explicitly model
index-descent costs, since that's what's really at stake when considering
different indexes with similar leaf-page-level costs. We tried that once
long ago, and found that charging random_page_cost per page descended
through was way too much, because upper btree levels tend to stay in cache
in real-world workloads. However, there's still CPU costs to think about,
and the previous fudge factors can be seen as a crude attempt to account
for those costs. So this patch replaces those fudge factors with explicit
charges for the number of tuple comparisons needed to descend the index
tree, plus a small charge per page touched in the descent. The cost
multipliers are chosen so that the resulting charges are in the vicinity of
the historical (pre-9.2) fudge factors for indexes of up to about a million
tuples, while not ballooning unreasonably beyond that, as the old fudge
factor did (even more so in 9.2).
To make this work accurately for btree indexes, add some code that allows
extraction of the known root-page height from a btree. There's no
equivalent number readily available for other index types, but we can use
the log of the number of index pages as an approximate substitute.
This seems like too much of a behavioral change to risk back-patching,
but it should improve matters going forward. In 9.2 I'll just revert
the fudge-factor change.
2013-01-11 18:56:58 +01:00
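In outline the new charge looks like this (a sketch of the idea; the committed formulas live in selfuncs.c, and cpu_operator_cost is the standard cost GUC):

#include <math.h>
#include "optimizer/cost.h"     /* for cpu_operator_cost */

/* sketch: explicit index-descent charge replacing the old fudge factors */
static double
descent_cost_sketch(double index_tuples, int tree_height)
{
    double      cost = 0.0;

    /* about log2(N) tuple comparisons to descend a tree of N tuples */
    if (index_tuples > 1.0)
        cost += ceil(log(index_tuples) / log(2.0)) * cpu_operator_cost;

    /* plus a small charge (~50x cpu_operator_cost) per page descended */
    cost += (tree_height + 1) * 50.0 * cpu_operator_cost;

    return cost;
}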
|
|
|
WRITE_INT_FIELD(tree_height);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_INT_FIELD(ncolumns);
|
2013-01-11 18:56:58 +01:00
|
|
|
/* array fields aren't really worth the trouble to print */
|
2011-02-17 01:24:45 +01:00
|
|
|
WRITE_OID_FIELD(relam);
|
2011-10-11 20:20:06 +02:00
|
|
|
/* indexprs is redundant since we print indextlist */
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_NODE_FIELD(indpred);
|
2011-10-11 20:20:06 +02:00
|
|
|
WRITE_NODE_FIELD(indextlist);
|
Support using index-only scans with partial indexes in more cases.
Previously, the planner would reject an index-only scan if any restriction
clause for its table used a column not available from the index, even
if that restriction clause would later be dropped from the plan entirely
because it's implied by the index's predicate. This is a fairly common
situation for partial indexes because predicates using columns not included
in the index are often the most useful kind of predicate, and we have to
duplicate (or at least imply) the predicate in the WHERE clause in order
to get the index to be considered at all. So index-only scans were
essentially unavailable with such partial indexes.
To fix, we have to do detection of implied-by-predicate clauses much
earlier in the planner. This patch puts it in check_index_predicates
(nee check_partial_indexes), meaning it gets done for every partial index,
whereas we previously only considered this issue at createplan time,
so that the work was only done for an index actually selected for use.
That could result in a noticeable planning slowdown for queries against
tables with many partial indexes. However, testing suggested that there
isn't really a significant cost, especially not with reasonable numbers
of partial indexes. We do get a small additional benefit, which is that
cost_index is more accurate since it correctly discounts the evaluation
cost of clauses that will be removed. We can also avoid considering such
clauses as potential indexquals, which saves useless matching cycles in
the case where the predicate columns aren't in the index, and prevents
generating bogus plans that double-count the clause's selectivity when
the columns are in the index.
Tomas Vondra and Kyotaro Horiguchi, reviewed by Kevin Grittner and
Konstantin Knizhnik, and whacked around a little by me
2016-03-31 20:48:56 +02:00
|
|
|
WRITE_NODE_FIELD(indrestrictinfo);
|
2005-06-06 00:32:58 +02:00
|
|
|
WRITE_BOOL_FIELD(predOK);
|
|
|
|
WRITE_BOOL_FIELD(unique);
|
2011-10-23 06:43:39 +02:00
|
|
|
WRITE_BOOL_FIELD(immediate);
|
2011-02-17 01:24:45 +01:00
|
|
|
WRITE_BOOL_FIELD(hypothetical);
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
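Under the new design an index AM's SQL-visible surface is just a handler returning the struct; a sketch (the my_am_* names are hypothetical, and the support callbacks are assumed to be defined elsewhere, but the IndexAmRoutine fields are the real struct members):

#include "fmgr.h"
#include "access/amapi.h"

PG_FUNCTION_INFO_V1(my_am_handler);     /* hypothetical index AM */

Datum
my_am_handler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = 0;        /* capability fields ... */
    amroutine->amsupport = 1;
    amroutine->amcanorder = false;
    amroutine->amcanunique = false;

    amroutine->ambuild = my_am_build;   /* ... and plain C support */
    amroutine->aminsert = my_am_insert; /* functions, now checked by */
    amroutine->amvalidate = my_am_validate; /* the C compiler */
    /* ... remaining callbacks elided in this sketch ... */

    PG_RETURN_POINTER(amroutine);
}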
|
|
|
/* we don't bother with fields copied from the index AM's API struct */
|
2005-06-06 00:32:58 +02:00
|
|
|
}
|
|
|
|
|
2016-06-18 21:22:34 +02:00
|
|
|
static void
|
|
|
|
_outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
WRITE_NODE_TYPE("FOREIGNKEYOPTINFO");
|
|
|
|
|
|
|
|
WRITE_UINT_FIELD(con_relid);
|
|
|
|
WRITE_UINT_FIELD(ref_relid);
|
|
|
|
WRITE_INT_FIELD(nkeys);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(conkey, node->nkeys);
|
|
|
|
WRITE_ATTRNUMBER_ARRAY(confkey, node->nkeys);
|
|
|
|
WRITE_OID_ARRAY(conpfeqop, node->nkeys);
|
2016-06-18 21:22:34 +02:00
|
|
|
WRITE_INT_FIELD(nmatched_ec);
|
Fix foreign-key selectivity estimation in the presence of constants.
get_foreign_key_join_selectivity() looks for join clauses that equate
the two sides of the FK constraint. However, if we have a query like
"WHERE fktab.a = pktab.a and fktab.a = 1", it won't find any such join
clause, because equivclass.c replaces the given clauses with "fktab.a
= 1 and pktab.a = 1", which can be enforced at the scan level, leaving
nothing to be done for column "a" at the join level.
We can fix that expectation without much trouble, but then a new problem
arises: applying the foreign-key-based selectivity rule produces a
rowcount underestimate, because we're effectively double-counting the
selectivity of the "fktab.a = 1" clause. So we have to cancel that
selectivity out of the estimate.
To fix, refactor process_implied_equality() so that it can pass back the
new RestrictInfo to its callers in equivclass.c, allowing the generated
"fktab.a = 1" clause to be saved in the EquivalenceClass's ec_derives
list. Then it's not much trouble to dig out the relevant RestrictInfo
when we need to adjust an FK selectivity estimate. (While at it, we
can also remove the expensive use of initialize_mergeclause_eclasses()
to set up the new RestrictInfo's left_ec and right_ec pointers.
The equivclass.c code can set those basically for free.)
This seems like clearly a bug fix, but I'm hesitant to back-patch it,
first because there's some API/ABI risk for extensions and second because
we're usually loath to destabilize plan choices in stable branches.
Per report from Sigrid Ehrenreich.
Discussion: https://postgr.es/m/1019549.1603770457@sss.pgh.pa.us
Discussion: https://postgr.es/m/AM6PR02MB5287A0ADD936C1FA80973E72AB190@AM6PR02MB5287.eurprd02.prod.outlook.com
2020-10-28 16:15:47 +01:00
|
|
|
WRITE_INT_FIELD(nconst_ec);
|
2016-06-18 21:22:34 +02:00
|
|
|
WRITE_INT_FIELD(nmatched_rcols);
|
|
|
|
WRITE_INT_FIELD(nmatched_ri);
|
|
|
|
/* for compactness, just print the number of matches per column: */
|
|
|
|
appendStringInfoString(str, " :eclass");
|
|
|
|
for (i = 0; i < node->nkeys; i++)
|
|
|
|
appendStringInfo(str, " %d", (node->eclass[i] != NULL));
|
|
|
|
appendStringInfoString(str, " :rinfos");
|
|
|
|
for (i = 0; i < node->nkeys; i++)
|
|
|
|
appendStringInfo(str, " %d", list_length(node->rinfos[i]));
|
|
|
|
}
|
|
|
|
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 18:06:10 +01:00
|
|
|
static void
|
|
|
|
_outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("STATISTICEXTINFO");
|
|
|
|
|
|
|
|
/* NB: this isn't a complete set of fields */
|
|
|
|
WRITE_OID_FIELD(statOid);
|
|
|
|
/* don't write rel, leads to infinite recursion in plan tree dump */
|
|
|
|
WRITE_CHAR_FIELD(kind);
|
|
|
|
WRITE_BITMAPSET_FIELD(keys);
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outEquivalenceClass(StringInfo str, const EquivalenceClass *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2007-01-20 21:45:41 +01:00
|
|
|
/*
|
|
|
|
* To simplify reading, we just chase up to the topmost merged EC and
|
|
|
|
* print that, without bothering to show the merge-ees separately.
|
|
|
|
*/
|
|
|
|
while (node->ec_merged)
|
|
|
|
node = node->ec_merged;
|
2002-11-25 19:12:12 +01:00
|
|
|
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_NODE_TYPE("EQUIVALENCECLASS");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(ec_opfamilies);
|
2011-03-20 01:29:08 +01:00
|
|
|
WRITE_OID_FIELD(ec_collation);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_NODE_FIELD(ec_members);
|
|
|
|
WRITE_NODE_FIELD(ec_sources);
|
2007-01-22 21:00:40 +01:00
|
|
|
WRITE_NODE_FIELD(ec_derives);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(ec_relids);
|
|
|
|
WRITE_BOOL_FIELD(ec_has_const);
|
|
|
|
WRITE_BOOL_FIELD(ec_has_volatile);
|
|
|
|
WRITE_BOOL_FIELD(ec_below_outer_join);
|
|
|
|
WRITE_BOOL_FIELD(ec_broken);
|
2007-11-08 22:49:48 +01:00
|
|
|
WRITE_UINT_FIELD(ec_sortref);
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
WRITE_UINT_FIELD(ec_min_security);
|
|
|
|
WRITE_UINT_FIELD(ec_max_security);
|
2007-01-20 21:45:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outEquivalenceMember(StringInfo str, const EquivalenceMember *node)
|
2007-01-20 21:45:41 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("EQUIVALENCEMEMBER");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(em_expr);
|
|
|
|
WRITE_BITMAPSET_FIELD(em_relids);
|
Fix planning of non-strict equivalence clauses above outer joins.
If a potential equivalence clause references a variable from the nullable
side of an outer join, the planner needs to take care that derived clauses
are not pushed to below the outer join; else they may use the wrong value
for the variable. (The problem arises only with non-strict clauses, since
if an upper clause can be proven strict then the outer join will get
simplified to a plain join.) The planner attempted to prevent this type
of error by checking that potential equivalence clauses aren't
outerjoin-delayed as a whole, but actually we have to check each side
separately, since the two sides of the clause will get moved around
separately if it's treated as an equivalence. Bugs of this type can be
demonstrated as far back as 7.4, even though releases before 8.3 had only
a very ad-hoc notion of equivalence clauses.
In addition, we neglected to account for the possibility that such clauses
might have nonempty nullable_relids even when not outerjoin-delayed; so the
equivalence-class machinery lacked logic to compute correct nullable_relids
values for clauses it constructs. This oversight was harmless before 9.2
because we were only using RestrictInfo.nullable_relids for OR clauses;
but as of 9.2 it could result in pushing constructed equivalence clauses
to incorrect places. (This accounts for bug #7604 from Bill MacArthur.)
Fix the first problem by adding a new test check_equivalence_delay() in
distribute_qual_to_rels, and fix the second one by adding code in
equivclass.c and called functions to set correct nullable_relids for
generated clauses. Although I believe the second part of this is not
currently necessary before 9.2, I chose to back-patch it anyway, partly to
keep the logic similar across branches and partly because it seems possible
we might find other reasons why we need valid values of nullable_relids in
the older branches.
Add regression tests illustrating these problems. In 9.0 and up, also
add test cases checking that we can push constants through outer joins,
since we've broken that optimization before and I nearly broke it again
with an overly simplistic patch for this problem.
2012-10-18 18:28:45 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(em_nullable_relids);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_BOOL_FIELD(em_is_const);
|
|
|
|
WRITE_BOOL_FIELD(em_is_child);
|
|
|
|
WRITE_OID_FIELD(em_datatype);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPathKey(StringInfo str, const PathKey *node)
|
2007-01-20 21:45:41 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PATHKEY");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(pk_eclass);
|
|
|
|
WRITE_OID_FIELD(pk_opfamily);
|
|
|
|
WRITE_INT_FIELD(pk_strategy);
|
|
|
|
WRITE_BOOL_FIELD(pk_nulls_first);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2016-03-14 21:59:59 +01:00
|
|
|
static void
|
|
|
|
_outPathTarget(StringInfo str, const PathTarget *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PATHTARGET");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(exprs);
|
2021-09-14 09:34:50 +02:00
|
|
|
WRITE_INDEX_ARRAY(sortgrouprefs, list_length(node->exprs));
|
2016-03-14 21:59:59 +01:00
|
|
|
WRITE_FLOAT_FIELD(cost.startup, "%.2f");
|
|
|
|
WRITE_FLOAT_FIELD(cost.per_tuple, "%.2f");
|
|
|
|
WRITE_INT_FIELD(width);
|
2021-03-29 03:47:05 +02:00
|
|
|
WRITE_ENUM_FIELD(has_volatile_expr, VolatileFunctionStatus);
|
2016-03-14 21:59:59 +01:00
|
|
|
}
|
|
|
|
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
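The caching contract can be sketched as a lookup over the rel's ppilist: every path with the same required_outer set shares one ParamPathInfo, and therefore one ppi_rows estimate (the helper name is hypothetical; the real counterpart is get_baserel_parampathinfo()):

#include "nodes/pathnodes.h"

/* hypothetical lookup: reuse the ParamPathInfo for this parameterization */
static ParamPathInfo *
find_cached_ppi(RelOptInfo *rel, Relids required_outer)
{
    ListCell   *lc;

    foreach(lc, rel->ppilist)
    {
        ParamPathInfo *ppi = (ParamPathInfo *) lfirst(lc);

        if (bms_equal(ppi->ppi_req_outer, required_outer))
            return ppi;         /* same parameterization: same ppi_rows */
    }
    return NULL;                /* caller builds one and appends it */
}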
|
|
|
static void
|
|
|
|
_outParamPathInfo(StringInfo str, const ParamPathInfo *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PARAMPATHINFO");
|
|
|
|
|
|
|
|
WRITE_BITMAPSET_FIELD(ppi_req_outer);
|
|
|
|
WRITE_FLOAT_FIELD(ppi_rows, "%.0f");
|
|
|
|
WRITE_NODE_FIELD(ppi_clauses);
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRestrictInfo(StringInfo str, const RestrictInfo *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("RESTRICTINFO");
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-06-16 00:51:45 +02:00
|
|
|
/* NB: this isn't a complete set of fields */
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(clause);
|
2004-01-05 06:07:36 +01:00
|
|
|
WRITE_BOOL_FIELD(is_pushed_down);
|
2005-11-15 00:54:23 +01:00
|
|
|
WRITE_BOOL_FIELD(outerjoin_delayed);
|
2004-01-05 06:07:36 +01:00
|
|
|
WRITE_BOOL_FIELD(can_join);
|
Revise the planner's handling of "pseudoconstant" WHERE clauses, that is
clauses containing no variables and no volatile functions. Such a clause
can be used as a one-time qual in a gating Result plan node, to suppress
plan execution entirely when it is false. Even when the clause is true,
putting it in a gating node wins by avoiding repeated evaluation of the
clause. In previous PG releases, query_planner() would do this for
pseudoconstant clauses appearing at the top level of the jointree, but
there was no ability to generate a gating Result deeper in the plan tree.
To fix it, get rid of the special case in query_planner(), and instead
process pseudoconstant clauses through the normal RestrictInfo qual
distribution mechanism. When a pseudoconstant clause is found attached to
a path node in create_plan(), pull it out and generate a gating Result at
that point. This requires special-casing pseudoconstants in selectivity
estimation and cost_qual_eval, but on the whole it's pretty clean.
It probably even makes the planner a bit faster than before for the normal
case of no pseudoconstants, since removing pull_constant_clauses saves one
useless traversal of the qual tree. Per gripe from Phil Frost.
2006-07-01 20:38:33 +02:00
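The classification itself is cheap: a clause is pseudoconstant exactly when it mentions no Vars and no volatile functions, as this sketch of the test shows (cf. is_pseudo_constant_clause() in clauses.c):

#include "optimizer/optimizer.h"

/* sketch of the pseudoconstant test applied during qual distribution */
static bool
clause_is_pseudoconstant(Node *clause)
{
    return !contain_var_clause(clause) &&
           !contain_volatile_functions(clause);
}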
|
|
|
WRITE_BOOL_FIELD(pseudoconstant);
|
Improve RLS planning by marking individual quals with security levels.
In an RLS query, we must ensure that security filter quals are evaluated
before ordinary query quals, in case the latter contain "leaky" functions
that could expose the contents of sensitive rows. The original
implementation of RLS planning ensured this by pushing the scan of a
secured table into a sub-query that it marked as a security-barrier view.
Unfortunately this results in very inefficient plans in many cases, because
the sub-query cannot be flattened and gets planned independently of the
rest of the query.
To fix, drop the use of sub-queries to enforce RLS qual order, and instead
mark each qual (RestrictInfo) with a security_level field establishing its
priority for evaluation. Quals must be evaluated in security_level order,
except that "leakproof" quals can be allowed to go ahead of quals of lower
security_level, if it's helpful to do so. This has to be enforced within
the ordering of any one list of quals to be evaluated at a table scan node,
and we also have to ensure that quals are not chosen for early evaluation
(i.e., use as an index qual or TID scan qual) if they're not allowed to go
ahead of other quals at the scan node.
This is sufficient to fix the problem for RLS quals, since we only support
RLS policies on simple tables and thus RLS quals will always exist at the
table scan level only. Eventually these qual ordering rules should be
enforced for join quals as well, which would permit improving planning for
explicit security-barrier views; but that's a task for another patch.
Note that FDWs would need to be aware of these rules --- and not, for
example, send an insecure qual for remote execution --- but since we do
not yet allow RLS policies on foreign tables, the case doesn't arise.
This will need to be addressed before we can allow such policies.
Patch by me, reviewed by Stephen Frost and Dean Rasheed.
Discussion: https://postgr.es/m/8185.1477432701@sss.pgh.pa.us
2017-01-18 18:58:20 +01:00
|
|
|
WRITE_BOOL_FIELD(leakproof);
|
2021-03-29 03:47:05 +02:00
|
|
|
WRITE_ENUM_FIELD(has_volatile, VolatileFunctionStatus);
|
2017-01-18 18:58:20 +01:00
|
|
|
WRITE_UINT_FIELD(security_level);
|
2004-01-04 04:51:52 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(clause_relids);
|
2005-06-09 06:19:00 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(required_relids);
|
Revise parameterized-path mechanism to fix assorted issues.
This patch adjusts the treatment of parameterized paths so that all paths
with the same parameterization (same set of required outer rels) for the
same relation will have the same rowcount estimate. We cache the rowcount
estimates to ensure that property, and hopefully save a few cycles too.
Doing this makes it practical for add_path_precheck to operate without
a rowcount estimate: it need only assume that paths with different
parameterizations never dominate each other, which is close enough to
true anyway for coarse filtering, because normally a more-parameterized
path should yield fewer rows thanks to having more join clauses to apply.
In add_path, we do the full nine yards of comparing rowcount estimates
along with everything else, so that we can discard parameterized paths that
don't actually have an advantage. This fixes some issues I'd found with
add_path rejecting parameterized paths on the grounds that they were more
expensive than not-parameterized ones, even though they yielded many fewer
rows and hence would be cheaper once subsequent joining was considered.
To make the same-rowcounts assumption valid, we have to require that any
parameterized path enforce *all* join clauses that could be obtained from
the particular set of outer rels, even if not all of them are useful for
indexing. This is required at both base scans and joins. It's a good
thing anyway since the net impact is that join quals are checked at the
lowest practical level in the join tree. Hence, discard the original
rather ad-hoc mechanism for choosing parameterization joinquals, and build
a better one that has a more principled rule for when clauses can be moved.
The original rule was actually buggy anyway for lack of knowledge about
which relations are part of an outer join's outer side; getting this right
requires adding an outer_relids field to RestrictInfo.
2012-04-19 21:52:46 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(outer_relids);
|
2009-04-16 22:42:16 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(nullable_relids);
|
2003-02-08 21:20:55 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(left_relids);
|
|
|
|
WRITE_BITMAPSET_FIELD(right_relids);
|
2004-01-04 01:07:32 +01:00
|
|
|
WRITE_NODE_FIELD(orclause);
|
2007-02-12 18:19:30 +01:00
|
|
|
/* don't write parent_ec, leads to infinite recursion in plan tree dump */
|
2009-02-07 00:43:24 +01:00
|
|
|
WRITE_FLOAT_FIELD(norm_selec, "%.4f");
|
|
|
|
WRITE_FLOAT_FIELD(outer_selec, "%.4f");
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_NODE_FIELD(mergeopfamilies);
|
2007-02-12 18:19:30 +01:00
|
|
|
/* don't write left_ec, leads to infinite recursion in plan tree dump */
|
|
|
|
/* don't write right_ec, leads to infinite recursion in plan tree dump */
|
2007-01-22 21:00:40 +01:00
|
|
|
WRITE_NODE_FIELD(left_em);
|
|
|
|
WRITE_NODE_FIELD(right_em);
|
2007-01-20 21:45:41 +01:00
|
|
|
WRITE_BOOL_FIELD(outer_is_left);
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_OID_FIELD(hashjoinoperator);
|
2021-11-08 02:40:33 +01:00
|
|
|
WRITE_OID_FIELD(left_hasheqoperator);
|
|
|
|
WRITE_OID_FIELD(right_hasheqoperator);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
Refactor the representation of indexable clauses in IndexPaths.
In place of three separate but interrelated lists (indexclauses,
indexquals, and indexqualcols), an IndexPath now has one list
"indexclauses" of IndexClause nodes. This holds basically the same
information as before, but in a more useful format: in particular, there
is now a clear connection between an indexclause (an original restriction
clause from WHERE or JOIN/ON) and the indexquals (directly usable index
conditions) derived from it.
We also change the ground rules a bit by mandating that clause commutation,
if needed, be done up-front so that what is stored in the indexquals list
is always directly usable as an index condition. This gets rid of repeated
re-determination of which side of the clause is the indexkey during costing
and plan generation, as well as repeated lookups of the commutator
operator. To minimize the added up-front cost, the typical case of
commuting a plain OpExpr is handled by a new special-purpose function
commute_restrictinfo(). For RowCompareExprs, generating the new clause
properly commuted to begin with is not really any more complex than before,
it's just different --- and we can save doing that work twice, as the
pretty-klugy original implementation did.
Tracking the connection between original and derived clauses lets us
also track explicitly whether the derived clauses are an exact or lossy
translation of the original. This provides a cheap solution to getting
rid of unnecessary rechecks of boolean index clauses, which previously
seemed like it'd be more expensive than it was worth.
Another pleasant (IMO) side-effect is that EXPLAIN now always shows
index clauses with the indexkey on the left; this seems less confusing.
This commit leaves expand_indexqual_conditions() and some related
functions in a slightly messy state. I didn't bother to change them
any more than minimally necessary to work with the new data structure,
because all that code is going to be refactored out of existence in
a follow-on patch.
Discussion: https://postgr.es/m/22182.1549124950@sss.pgh.pa.us
2019-02-09 23:30:43 +01:00
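For the common exact-match case, constructing the new node is simple (a hypothetical helper; it assumes the clause already has the indexkey on the left, per the new ground rule):

#include "nodes/pathnodes.h"

static IndexClause *
make_simple_index_clause(RestrictInfo *rinfo, int indexcol)
{
    IndexClause *iclause = makeNode(IndexClause);

    iclause->rinfo = rinfo;             /* the original restriction clause */
    iclause->indexquals = list_make1(rinfo);    /* directly usable as-is */
    iclause->lossy = false;             /* exact translation: no recheck */
    iclause->indexcol = indexcol;       /* index column the qual matches */
    iclause->indexcols = NIL;           /* only used for RowCompare quals */

    return iclause;
}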
|
|
|
static void
|
|
|
|
_outIndexClause(StringInfo str, const IndexClause *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("INDEXCLAUSE");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(rinfo);
|
|
|
|
WRITE_NODE_FIELD(indexquals);
|
|
|
|
WRITE_BOOL_FIELD(lossy);
|
|
|
|
WRITE_INT_FIELD(indexcol);
|
|
|
|
WRITE_NODE_FIELD(indexcols);
|
|
|
|
}
|
|
|
|
|
2008-10-21 22:42:53 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlaceHolderVar(StringInfo str, const PlaceHolderVar *node)
|
2008-10-21 22:42:53 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLACEHOLDERVAR");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(phexpr);
|
|
|
|
WRITE_BITMAPSET_FIELD(phrels);
|
|
|
|
WRITE_UINT_FIELD(phid);
|
|
|
|
WRITE_UINT_FIELD(phlevelsup);
|
|
|
|
}
|
|
|
|
|
2008-08-14 20:48:00 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSpecialJoinInfo(StringInfo str, const SpecialJoinInfo *node)
|
2005-12-20 03:30:36 +01:00
|
|
|
{
|
2008-08-14 20:48:00 +02:00
|
|
|
WRITE_NODE_TYPE("SPECIALJOININFO");
|
2005-12-20 03:30:36 +01:00
|
|
|
|
|
|
|
WRITE_BITMAPSET_FIELD(min_lefthand);
|
|
|
|
WRITE_BITMAPSET_FIELD(min_righthand);
|
2007-08-31 03:44:06 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(syn_lefthand);
|
|
|
|
WRITE_BITMAPSET_FIELD(syn_righthand);
|
2008-08-14 20:48:00 +02:00
|
|
|
WRITE_ENUM_FIELD(jointype, JoinType);
|
2005-12-20 03:30:36 +01:00
|
|
|
WRITE_BOOL_FIELD(lhs_strict);
|
2007-05-23 01:23:58 +02:00
|
|
|
WRITE_BOOL_FIELD(delay_upper_joins);
|
Improve planner's cost estimation in the presence of semijoins.
If we have a semijoin, say
SELECT * FROM x WHERE x1 IN (SELECT y1 FROM y)
and we're estimating the cost of a parameterized indexscan on x, the number
of repetitions of the indexscan should not be taken as the size of y; it'll
really only be the number of distinct values of y1, because the only valid
plan with y on the outside of a nestloop would require y to be unique-ified
before joining it to x. Most of the time this doesn't make that much
difference, but sometimes it can lead to drastically underestimating the
cost of the indexscan and hence choosing a bad plan, as pointed out by
David Kubečka.
Fixing this is a bit difficult because parameterized indexscans are costed
out quite early in the planning process, before we have the information
that would be needed to call estimate_num_groups() and thereby estimate the
number of distinct values of the join column(s). However we can move the
code that extracts a semijoin RHS's unique-ification columns, so that it's
done in initsplan.c rather than on-the-fly in create_unique_path(). That
shouldn't make any difference speed-wise and it's really a bit cleaner too.
The other bit of information we need is the size of the semijoin RHS,
which is easy if it's a single relation (we make those estimates before
considering indexscan costs) but problematic if it's a join relation.
The solution adopted here is just to use the product of the sizes of the
join component rels. That will generally be an overestimate, but since
estimate_num_groups() only uses this input as a clamp, an overestimate
shouldn't hurt us too badly. In any case we don't allow this new logic
to produce a value larger than we would have chosen before, so that at
worst an overestimate leaves us no wiser than we were before.
2015-03-12 02:21:00 +01:00
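The clamp can be sketched as follows (hypothetical helper; bms_next_member() and find_base_rel() are the real primitives):

#include "optimizer/pathnode.h" /* for find_base_rel() */

/*
 * Hypothetical helper: approximate the size of a join-relation semijoin
 * RHS by the product of its component rels, but never let the new logic
 * produce a value larger than the pre-existing estimate.
 */
static double
semijoin_rhs_rows_sketch(PlannerInfo *root, Relids rhs_relids,
                         double old_estimate)
{
    double      rows = 1.0;
    int         relid = -1;

    while ((relid = bms_next_member(rhs_relids, relid)) >= 0)
    {
        RelOptInfo *rel = find_base_rel(root, relid);

        rows *= rel->rows;      /* product generally overestimates */
    }
    return Min(rows, old_estimate);     /* at worst, no wiser than before */
}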
|
|
|
WRITE_BOOL_FIELD(semi_can_btree);
|
|
|
|
WRITE_BOOL_FIELD(semi_can_hash);
|
|
|
|
WRITE_NODE_FIELD(semi_operators);
|
|
|
|
WRITE_NODE_FIELD(semi_rhs_exprs);
|
2003-01-20 19:55:07 +01:00
|
|
|
}
|
|
|
|
|
2006-01-31 22:39:25 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outAppendRelInfo(StringInfo str, const AppendRelInfo *node)
|
2006-01-31 22:39:25 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("APPENDRELINFO");
|
|
|
|
|
|
|
|
WRITE_UINT_FIELD(parent_relid);
|
|
|
|
WRITE_UINT_FIELD(child_relid);
|
|
|
|
WRITE_OID_FIELD(parent_reltype);
|
|
|
|
WRITE_OID_FIELD(child_reltype);
|
|
|
|
WRITE_NODE_FIELD(translated_vars);
|
2019-12-03 00:05:29 +01:00
|
|
|
WRITE_INT_FIELD(num_child_cols);
|
|
|
|
WRITE_ATTRNUMBER_ARRAY(parent_colnos, node->num_child_cols);
|
2006-01-31 22:39:25 +01:00
|
|
|
WRITE_OID_FIELD(parent_reloid);
|
|
|
|
}
|
|
|
|
|
Rework planning and execution of UPDATE and DELETE.
This patch makes two closely related sets of changes:
1. For UPDATE, the subplan of the ModifyTable node now only delivers
the new values of the changed columns (i.e., the expressions computed
in the query's SET clause) plus row identity information such as CTID.
ModifyTable must re-fetch the original tuple to merge in the old
values of any unchanged columns. The core advantage of this is that
the changed columns are uniform across all tables of an inherited or
partitioned target relation, whereas the other columns might not be.
A secondary advantage, when the UPDATE involves joins, is that less
data needs to pass through the plan tree. The disadvantage of course
is an extra fetch of each tuple to be updated. However, that seems to
be very nearly free in context; even worst-case tests don't show it to
add more than a couple percent to the total query cost. At some point
it might be interesting to combine the re-fetch with the tuple access
that ModifyTable must do anyway to mark the old tuple dead; but that
would require a good deal of refactoring and it seems it wouldn't buy
all that much, so this patch doesn't attempt it.
2. For inherited UPDATE/DELETE, instead of generating a separate
subplan for each target relation, we now generate a single subplan
that is just exactly like a SELECT's plan, then stick ModifyTable
on top of that. To let ModifyTable know which target relation a
given incoming row refers to, a tableoid junk column is added to
the row identity information. This gets rid of the horrid hack
that was inheritance_planner(), eliminating O(N^2) planning cost
and memory consumption in cases where there were many unprunable
target relations.
Point 2 of course requires point 1, so that there is a uniform
definition of the non-junk columns to be returned by the subplan.
We can't insist on uniform definition of the row identity junk
columns however, if we want to keep the ability to have both
plain and foreign tables in a partitioning hierarchy. Since
it wouldn't scale very far to have every child table have its
own row identity column, this patch includes provisions to merge
similar row identity columns into one column of the subplan result.
In particular, we can merge the whole-row Vars typically used as
row identity by FDWs into one column by pretending they are type
RECORD. (It's still okay for the actual composite Datums to be
labeled with the table's rowtype OID, though.)
There is more that can be done to file down residual inefficiencies
in this patch, but it seems to be committable now.
FDW authors should note several API changes:
* The argument list for AddForeignUpdateTargets() has changed, and so
has the method it must use for adding junk columns to the query. Call
add_row_identity_var() instead of manipulating the parse tree directly.
You might want to reconsider exactly what you're adding, too.
* PlanDirectModify() must now work a little harder to find the
ForeignScan plan node; if the foreign table is part of a partitioning
hierarchy then the ForeignScan might not be the direct child of
ModifyTable. See postgres_fdw for sample code.
* To check whether a relation is a target relation, it's no
longer sufficient to compare its relid to root->parse->resultRelation.
Instead, check it against all_result_relids or leaf_result_relids,
as appropriate.
Amit Langote and Tom Lane
Discussion: https://postgr.es/m/CA+HiwqHpHdqdDn48yCEhynnniahH78rwcrv1rEX65-fsZGBOLQ@mail.gmail.com
2021-03-31 17:52:34 +02:00
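For example, an FDW using CTID for row identity now does roughly this in its AddForeignUpdateTargets callback (a sketch modeled on postgres_fdw; the callback name is hypothetical):

#include "access/sysattr.h"     /* SelfItemPointerAttributeNumber */
#include "catalog/pg_type_d.h"  /* TIDOID */
#include "nodes/makefuncs.h"    /* makeVar() */
#include "optimizer/appendinfo.h"   /* add_row_identity_var() */

static void
myAddForeignUpdateTargets(PlannerInfo *root,
                          Index rtindex,
                          RangeTblEntry *target_rte,
                          Relation target_relation)
{
    /* request the tuple's CTID as a junk row-identity column */
    Var        *var = makeVar(rtindex,
                              SelfItemPointerAttributeNumber,
                              TIDOID, -1, InvalidOid, 0);

    add_row_identity_var(root, var, rtindex, "ctid");
}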
|
|
|
static void
|
|
|
|
_outRowIdentityVarInfo(StringInfo str, const RowIdentityVarInfo *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("ROWIDENTITYVARINFO");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(rowidvar);
|
|
|
|
WRITE_INT_FIELD(rowidwidth);
|
|
|
|
WRITE_STRING_FIELD(rowidname);
|
|
|
|
WRITE_BITMAPSET_FIELD(rowidrels);
|
|
|
|
}
|
|
|
|
|
2008-10-21 22:42:53 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlaceHolderInfo(StringInfo str, const PlaceHolderInfo *node)
|
2008-10-21 22:42:53 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLACEHOLDERINFO");
|
|
|
|
|
|
|
|
WRITE_UINT_FIELD(phid);
|
|
|
|
WRITE_NODE_FIELD(ph_var);
|
|
|
|
WRITE_BITMAPSET_FIELD(ph_eval_at);
|
2013-08-18 02:22:37 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(ph_lateral);
|
2008-10-21 22:42:53 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(ph_needed);
|
|
|
|
WRITE_INT_FIELD(ph_width);
|
|
|
|
}
|
|
|
|
|
2010-11-04 17:01:17 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outMinMaxAggInfo(StringInfo str, const MinMaxAggInfo *node)
|
2010-11-04 17:01:17 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MINMAXAGGINFO");
|
|
|
|
|
|
|
|
WRITE_OID_FIELD(aggfnoid);
|
|
|
|
WRITE_OID_FIELD(aggsortop);
|
|
|
|
WRITE_NODE_FIELD(target);
|
2011-03-22 05:34:31 +01:00
|
|
|
/* We intentionally omit subroot --- too large, not interesting enough */
|
|
|
|
WRITE_NODE_FIELD(path);
|
|
|
|
WRITE_FLOAT_FIELD(pathcost, "%.2f");
|
|
|
|
WRITE_NODE_FIELD(param);
|
2010-11-04 17:01:17 +01:00
|
|
|
}
|
|
|
|
|
2007-02-19 08:03:34 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outPlannerParamItem(StringInfo str, const PlannerParamItem *node)
|
2007-02-19 08:03:34 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PLANNERPARAMITEM");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(item);
|
Fix PARAM_EXEC assignment mechanism to be safe in the presence of WITH.
The planner previously assumed that parameter Vars having the same absolute
query level, varno, and varattno could safely be assigned the same runtime
PARAM_EXEC slot, even though they might be different Vars appearing in
different subqueries. This was (probably) safe before the introduction of
CTEs, but the lazy-evaluation mechanism used for CTEs means that a CTE can
be executed during execution of some other subquery, causing the lifespan
of Params at the same syntactic nesting level as the CTE to overlap with
use of the same slots inside the CTE. In 9.1 we created additional hazards
by using the same parameter-assignment technology for nestloop inner scan
parameters, but it was broken before that, as illustrated by the added
regression test.
To fix, restructure the planner's management of PlannerParamItems so that
items having different semantic lifespans are kept rigorously separated.
This will probably result in complex queries using more runtime PARAM_EXEC
slots than before, but the slots are cheap enough that this hardly matters.
Also, stop generating PlannerParamItems containing Params for subquery
outputs: all we really need to do is reserve the PARAM_EXEC slot number,
and that now only takes incrementing a counter. The planning code is
simpler and probably faster than before, as well as being more correct.
Per report from Vik Reykja.
These changes will mostly also need to be made in the back branches, but
I'm going to hold off on that until after 9.2.0 wraps.
2012-09-05 18:54:03 +02:00
|
|
|
WRITE_INT_FIELD(paramId);
|
2007-02-19 08:03:34 +01:00
|
|
|
}

/*****************************************************************************
 *
 *    Stuff from extensible.h
 *
 *****************************************************************************/

static void
_outExtensibleNode(StringInfo str, const ExtensibleNode *node)
{
    const ExtensibleNodeMethods *methods;

    methods = GetExtensibleNodeMethods(node->extnodename, false);

    WRITE_NODE_TYPE("EXTENSIBLENODE");

    WRITE_STRING_FIELD(extnodename);

    /* serialize the private fields */
    methods->nodeOut(str, node);
}
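
/*
 * A minimal sketch (hypothetical names, fenced off from the build) of the
 * extension-side counterpart of _outExtensibleNode: the extension defines
 * ExtensibleNodeMethods whose nodeOut callback emits its private fields,
 * and registers them (typically from its _PG_init) so that the
 * GetExtensibleNodeMethods() lookup above can find them by extnodename.
 */
#ifdef NOT_USED
typedef struct MyPlanData
{
    ExtensibleNode xnode;       /* must be first; tagged T_ExtensibleNode */
    int         fanout;         /* example private field */
} MyPlanData;

static void
myplandata_out(StringInfo str, const struct ExtensibleNode *node)
{
    const MyPlanData *my = (const MyPlanData *) node;

    appendStringInfo(str, " :fanout %d", my->fanout);
}

static const ExtensibleNodeMethods myplandata_methods = {
    .extnodename = "MyPlanData",
    .node_size = sizeof(MyPlanData),
    .nodeOut = myplandata_out,
    /* nodeCopy, nodeEqual, and nodeRead are omitted in this sketch */
};

static void
myplandata_register(void)
{
    RegisterExtensibleNodeMethods(&myplandata_methods);
}
#endif                          /* NOT_USED */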

/*****************************************************************************
 *
 *    Stuff from parsenodes.h.
 *
 *****************************************************************************/

/*
 * print the basic stuff of all nodes that inherit from CreateStmt
 */
static void
_outCreateStmtInfo(StringInfo str, const CreateStmt *node)
{
    WRITE_NODE_FIELD(relation);
    WRITE_NODE_FIELD(tableElts);
    WRITE_NODE_FIELD(inhRelations);
    WRITE_NODE_FIELD(partspec);
    WRITE_NODE_FIELD(partbound);
    WRITE_NODE_FIELD(ofTypename);
    WRITE_NODE_FIELD(constraints);
    WRITE_NODE_FIELD(options);
    WRITE_ENUM_FIELD(oncommit, OnCommitAction);
    WRITE_STRING_FIELD(tablespacename);
    WRITE_STRING_FIELD(accessMethod);
    WRITE_BOOL_FIELD(if_not_exists);
}

static void
_outCreateStmt(StringInfo str, const CreateStmt *node)
{
    WRITE_NODE_TYPE("CREATESTMT");

    _outCreateStmtInfo(str, (const CreateStmt *) node);
}
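
/*
 * CreateForeignTableStmt "inherits" from CreateStmt by embedding one as its
 * first field, so the (const CreateStmt *) casts above and below are safe;
 * _outCreateStmtInfo emits the shared fields and each subclass then appends
 * its own.
 */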

static void
_outCreateForeignTableStmt(StringInfo str, const CreateForeignTableStmt *node)
{
    WRITE_NODE_TYPE("CREATEFOREIGNTABLESTMT");

    _outCreateStmtInfo(str, (const CreateStmt *) node);

    WRITE_STRING_FIELD(servername);
    WRITE_NODE_FIELD(options);
}

static void
_outImportForeignSchemaStmt(StringInfo str, const ImportForeignSchemaStmt *node)
{
    WRITE_NODE_TYPE("IMPORTFOREIGNSCHEMASTMT");

    WRITE_STRING_FIELD(server_name);
    WRITE_STRING_FIELD(remote_schema);
    WRITE_STRING_FIELD(local_schema);
    WRITE_ENUM_FIELD(list_type, ImportForeignSchemaType);
    WRITE_NODE_FIELD(table_list);
    WRITE_NODE_FIELD(options);
}
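
/*
 * Note on enum fields: WRITE_ENUM_FIELD(list_type, ImportForeignSchemaType)
 * serializes the enum as a bare integer; the macro expands to roughly
 *
 *      appendStringInfo(str, " :list_type %d", (int) node->list_type);
 *
 * and the matching READ_ENUM_FIELD in readfuncs.c casts the integer back
 * to the named enum type.
 */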

static void
_outIndexStmt(StringInfo str, const IndexStmt *node)
{
    WRITE_NODE_TYPE("INDEXSTMT");

    WRITE_STRING_FIELD(idxname);
    WRITE_NODE_FIELD(relation);
    WRITE_STRING_FIELD(accessMethod);
    WRITE_STRING_FIELD(tableSpace);
    WRITE_NODE_FIELD(indexParams);
    WRITE_NODE_FIELD(indexIncludingParams);
    WRITE_NODE_FIELD(options);
    WRITE_NODE_FIELD(whereClause);
    WRITE_NODE_FIELD(excludeOpNames);
    WRITE_STRING_FIELD(idxcomment);
    WRITE_OID_FIELD(indexOid);
    WRITE_OID_FIELD(oldNumber);
    WRITE_UINT_FIELD(oldCreateSubid);
    WRITE_UINT_FIELD(oldFirstRelfilelocatorSubid);
    WRITE_BOOL_FIELD(unique);
    WRITE_BOOL_FIELD(nulls_not_distinct);
    WRITE_BOOL_FIELD(primary);
    WRITE_BOOL_FIELD(isconstraint);
    WRITE_BOOL_FIELD(deferrable);
    WRITE_BOOL_FIELD(initdeferred);
    WRITE_BOOL_FIELD(transformed);
    WRITE_BOOL_FIELD(concurrent);
    WRITE_BOOL_FIELD(if_not_exists);
    WRITE_BOOL_FIELD(reset_default_tblspc);
}
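
/*
 * The oldNumber/oldCreateSubid/oldFirstRelfilelocatorSubid fields above are
 * internal-only: ALTER TABLE uses them when rebuilding an index so that the
 * new index can adopt the old relation storage (see tablecmds.c); they are
 * never set in user-issued CREATE INDEX trees.
 */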

static void
_outCreateStatsStmt(StringInfo str, const CreateStatsStmt *node)
{
    WRITE_NODE_TYPE("CREATESTATSSTMT");

    WRITE_NODE_FIELD(defnames);
    WRITE_NODE_FIELD(stat_types);
    WRITE_NODE_FIELD(exprs);
    WRITE_NODE_FIELD(relations);
    WRITE_STRING_FIELD(stxcomment);
    WRITE_BOOL_FIELD(transformed);
    WRITE_BOOL_FIELD(if_not_exists);
}

static void
_outAlterStatsStmt(StringInfo str, const AlterStatsStmt *node)
{
    WRITE_NODE_TYPE("ALTERSTATSSTMT");

    WRITE_NODE_FIELD(defnames);
    WRITE_INT_FIELD(stxstattarget);
    WRITE_BOOL_FIELD(missing_ok);
}

static void
_outNotifyStmt(StringInfo str, const NotifyStmt *node)
{
    WRITE_NODE_TYPE("NOTIFYSTMT");

    WRITE_STRING_FIELD(conditionname);
    WRITE_STRING_FIELD(payload);
}

static void
_outDeclareCursorStmt(StringInfo str, const DeclareCursorStmt *node)
{
    WRITE_NODE_TYPE("DECLARECURSORSTMT");

    WRITE_STRING_FIELD(portalname);
    WRITE_INT_FIELD(options);
    WRITE_NODE_FIELD(query);
}

static void
_outSelectStmt(StringInfo str, const SelectStmt *node)
{
    WRITE_NODE_TYPE("SELECT");

    WRITE_NODE_FIELD(distinctClause);
    WRITE_NODE_FIELD(intoClause);
    WRITE_NODE_FIELD(targetList);
    WRITE_NODE_FIELD(fromClause);
    WRITE_NODE_FIELD(whereClause);
    WRITE_NODE_FIELD(groupClause);
    WRITE_BOOL_FIELD(groupDistinct);
    WRITE_NODE_FIELD(havingClause);
    WRITE_NODE_FIELD(windowClause);
    WRITE_NODE_FIELD(valuesLists);
    WRITE_NODE_FIELD(sortClause);
    WRITE_NODE_FIELD(limitOffset);
    WRITE_NODE_FIELD(limitCount);
    WRITE_ENUM_FIELD(limitOption, LimitOption);
    WRITE_NODE_FIELD(lockingClause);
    WRITE_NODE_FIELD(withClause);
    WRITE_ENUM_FIELD(op, SetOperation);
    WRITE_BOOL_FIELD(all);
    WRITE_NODE_FIELD(larg);
    WRITE_NODE_FIELD(rarg);
}
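
/*
 * Debugging sketch: SelectStmt appears only in raw parse trees, so these
 * fields are normally inspected via nodeToString(), e.g. from a
 * hypothetical call site such as
 *
 *      elog(LOG, "%s", nodeToString(raw_parse_tree));
 *
 * which prints something shaped like "{SELECT :distinctClause <> ...}",
 * where <> marks an empty field.
 */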

static void
_outReturnStmt(StringInfo str, const ReturnStmt *node)
{
    WRITE_NODE_TYPE("RETURN");

    WRITE_NODE_FIELD(returnval);
}

static void
_outPLAssignStmt(StringInfo str, const PLAssignStmt *node)
{
    WRITE_NODE_TYPE("PLASSIGN");

    WRITE_STRING_FIELD(name);
    WRITE_NODE_FIELD(indirection);
    WRITE_INT_FIELD(nnames);
    WRITE_NODE_FIELD(val);
    WRITE_LOCATION_FIELD(location);
}

static void
_outFuncCall(StringInfo str, const FuncCall *node)
{
    WRITE_NODE_TYPE("FUNCCALL");

    WRITE_NODE_FIELD(funcname);
    WRITE_NODE_FIELD(args);
    WRITE_NODE_FIELD(agg_order);
    WRITE_NODE_FIELD(agg_filter);
    WRITE_NODE_FIELD(over);
    WRITE_BOOL_FIELD(agg_within_group);
    WRITE_BOOL_FIELD(agg_star);
    WRITE_BOOL_FIELD(agg_distinct);
    WRITE_BOOL_FIELD(func_variadic);
    WRITE_ENUM_FIELD(funcformat, CoercionForm);
    WRITE_LOCATION_FIELD(location);
}
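
/*
 * For example (a rough sketch; exact quoting and spacing aside), the
 * grammar's FuncCall for "count(*)" would serialize along the lines of
 *
 *      {FUNCCALL :funcname ("count") :args <> ... :agg_star true
 *       :agg_distinct false :func_variadic false :funcformat 0 :location 7}
 *
 * with <> for empty fields and booleans spelled out as true/false.
 */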

static void
_outDefElem(StringInfo str, const DefElem *node)
{
    WRITE_NODE_TYPE("DEFELEM");

    WRITE_STRING_FIELD(defnamespace);
    WRITE_STRING_FIELD(defname);
    WRITE_NODE_FIELD(arg);
    WRITE_ENUM_FIELD(defaction, DefElemAction);
    WRITE_LOCATION_FIELD(location);
}

static void
_outTableLikeClause(StringInfo str, const TableLikeClause *node)
{
    WRITE_NODE_TYPE("TABLELIKECLAUSE");

    WRITE_NODE_FIELD(relation);
    WRITE_UINT_FIELD(options);
    WRITE_OID_FIELD(relationOid);
}
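
/*
 * In the LockingClause below, strength selects among FOR UPDATE, FOR NO KEY
 * UPDATE, FOR SHARE, and FOR KEY SHARE (LockClauseStrength), while
 * waitPolicy records plain waiting versus NOWAIT or SKIP LOCKED
 * (LockWaitPolicy).
 */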

static void
_outLockingClause(StringInfo str, const LockingClause *node)
{
    WRITE_NODE_TYPE("LOCKINGCLAUSE");

    WRITE_NODE_FIELD(lockedRels);
    WRITE_ENUM_FIELD(strength, LockClauseStrength);
    WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
}

static void
_outXmlSerialize(StringInfo str, const XmlSerialize *node)
{
    WRITE_NODE_TYPE("XMLSERIALIZE");

    WRITE_ENUM_FIELD(xmloption, XmlOptionType);
    WRITE_NODE_FIELD(expr);
    WRITE_NODE_FIELD(typeName);
    WRITE_LOCATION_FIELD(location);
}

static void
_outTriggerTransition(StringInfo str, const TriggerTransition *node)
{
    WRITE_NODE_TYPE("TRIGGERTRANSITION");

    WRITE_STRING_FIELD(name);
    WRITE_BOOL_FIELD(isNew);
    WRITE_BOOL_FIELD(isTable);
}
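
/*
 * ColumnDef's char-typed fields below (storage, identity, generated) hold
 * '\0' when not set; WRITE_CHAR_FIELD routes them through outChar() rather
 * than a bare %c so that a zero byte still comes out as a parsable token.
 */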

static void
_outColumnDef(StringInfo str, const ColumnDef *node)
{
    WRITE_NODE_TYPE("COLUMNDEF");

    WRITE_STRING_FIELD(colname);
    WRITE_NODE_FIELD(typeName);
    WRITE_STRING_FIELD(compression);
    WRITE_INT_FIELD(inhcount);
    WRITE_BOOL_FIELD(is_local);
    WRITE_BOOL_FIELD(is_not_null);
    WRITE_BOOL_FIELD(is_from_type);
    WRITE_CHAR_FIELD(storage);
    WRITE_NODE_FIELD(raw_default);
    WRITE_NODE_FIELD(cooked_default);
    WRITE_CHAR_FIELD(identity);
    WRITE_NODE_FIELD(identitySequence);
    WRITE_CHAR_FIELD(generated);
    WRITE_NODE_FIELD(collClause);
    WRITE_OID_FIELD(collOid);
    WRITE_NODE_FIELD(constraints);
    WRITE_NODE_FIELD(fdwoptions);
    WRITE_LOCATION_FIELD(location);
}

static void
_outTypeName(StringInfo str, const TypeName *node)
{
    WRITE_NODE_TYPE("TYPENAME");

    WRITE_NODE_FIELD(names);
    WRITE_OID_FIELD(typeOid);
    WRITE_BOOL_FIELD(setof);
    WRITE_BOOL_FIELD(pct_type);
    WRITE_NODE_FIELD(typmods);
    WRITE_INT_FIELD(typemod);
    WRITE_NODE_FIELD(arrayBounds);
    WRITE_LOCATION_FIELD(location);
}

static void
_outTypeCast(StringInfo str, const TypeCast *node)
{
    WRITE_NODE_TYPE("TYPECAST");

    WRITE_NODE_FIELD(arg);
    WRITE_NODE_FIELD(typeName);
    WRITE_LOCATION_FIELD(location);
}

static void
_outCollateClause(StringInfo str, const CollateClause *node)
{
    WRITE_NODE_TYPE("COLLATECLAUSE");

    WRITE_NODE_FIELD(arg);
    WRITE_NODE_FIELD(collname);
    WRITE_LOCATION_FIELD(location);
}
|
|
|
|
|
2002-12-12 16:49:42 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outIndexElem(StringInfo str, const IndexElem *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("INDEXELEM");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
2003-05-28 18:04:02 +02:00
|
|
|
WRITE_NODE_FIELD(expr);
|
Adjust naming of indexes and their columns per recent discussion.
Index expression columns are now named after the FigureColname result for
their expressions, rather than always being "pg_expression_N". Digits are
appended to this name if needed to make the column name unique within the
index. (That happens for regular columns too, thus fixing the old problem
that CREATE INDEX fooi ON foo (f1, f1) fails. Before exclusion indexes
there was no real reason to do such a thing, but now maybe there is.)
Default names for indexes and associated constraints now include the column
names of all their columns, not only the first one as in previous practice.
(Of course, this will be truncated as needed to fit in NAMEDATALEN. Also,
pkey indexes retain the historical behavior of not naming specific columns
at all.)
An example of the results:
regression=# create table foo (f1 int, f2 text,
regression(# exclude (f1 with =, lower(f2) with =));
NOTICE: CREATE TABLE / EXCLUDE will create implicit index "foo_f1_lower_exclusion" for table "foo"
CREATE TABLE
regression=# \d foo_f1_lower_exclusion
Index "public.foo_f1_lower_exclusion"
Column | Type | Definition
--------+---------+------------
f1 | integer | f1
lower | text | lower(f2)
btree, for table "public.foo"
2009-12-23 03:35:25 +01:00
|
|
|
WRITE_STRING_FIELD(indexcolname);
|
2011-02-08 22:04:18 +01:00
|
|
|
WRITE_NODE_FIELD(collation);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(opclass);
|
Implement operator class parameters
PostgreSQL provides a set of template index access methods, where opclasses have
much freedom in the semantics of indexing. These index AMs are GiST, GIN,
SP-GiST and BRIN. There, opclasses define the representation of keys, operations
on them, and the supported search strategies. So it's natural that opclasses may
face some tradeoffs that require a user-side decision. This commit implements
opclass parameters, allowing users to set values that tell the opclass how to
index a particular dataset.
This commit doesn't introduce new storage in the system catalog. Instead it
uses pg_attribute.attoptions, which is used for table column storage options
but was unused for index attributes.
In order to avoid changing the signature of each opclass support function, we
implement a unified way to pass options to opclass support functions. Options
are passed via fn_expr as a constant bytea expression. This is possible because
opclass support functions are executed outside of expressions, so fn_expr is
unused for them.
This commit comes with some examples of opclass options usage. We parametrize
the signature length in GiST. That applies to multiple opclasses: tsvector_ops,
gist__intbig_ops, gist_ltree_ops, gist__ltree_ops, gist_trgm_ops and
gist_hstore_ops. Also, we parametrize the maximum number of integer ranges for
gist__int_ops. However, the main future use of this feature is expected
to be JSON, where users will be able to specify how to index particular
JSON parts.
Catversion is bumped.
Discussion: https://postgr.es/m/d22c3a18-31c7-1879-fc11-4c1ce2f5e5af%40postgrespro.ru
Author: Nikita Glukhov, revised by me
Reviewed-by: Nikolay Shaplov, Robert Haas, Tom Lane, Tomas Vondra, Alvaro Herrera
2020-03-30 18:17:11 +02:00
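As a hedged sketch of the user-facing syntax this enables, using the commit's
GiST signature-length example (table and column names are hypothetical;
assumes the pg_trgm extension is available):
-- illustrative only: the opclass parameter goes in parentheses after the opclass
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX docs_body_idx ON docs USING gist (body gist_trgm_ops (siglen = 64));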
|
|
|
WRITE_NODE_FIELD(opclassopts);
|
2007-01-09 03:14:16 +01:00
|
|
|
WRITE_ENUM_FIELD(ordering, SortByDir);
|
|
|
|
WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
|
2002-12-12 16:49:42 +01:00
|
|
|
}
|
|
|
|
|
Extended statistics on expressions
Allow defining extended statistics on expressions, not just on
simple column references. With this commit, expressions are supported
by all existing extended statistics kinds, improving the same types of
estimates. A simple example may look like this:
CREATE TABLE t (a int);
CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t;
ANALYZE t;
The collected statistics are useful e.g. to estimate queries with those
expressions in WHERE or GROUP BY clauses:
SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0;
SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20);
This introduces new internal statistics kind 'e' (expressions) which is
built automatically when the statistics object definition includes any
expressions. This represents single-expression statistics, as if there
was an expression index (but without the index maintenance overhead).
The statistics are stored in pg_statistic_ext_data as an array of
composite types, which is possible thanks to 79f6a942bd.
CREATE STATISTICS allows building statistics on a single expression, in
which case it's not possible to specify statistics kinds.
A new system view pg_stats_ext_exprs can be used to display expression
statistics, similarly to pg_stats and pg_stats_ext views.
ALTER TABLE ... ALTER COLUMN ... TYPE now treats extended statistics the same
way it treats indexes, i.e. it drops and recreates the statistics. This means
all statistics are reset, and we no longer try to preserve at least the
functional dependencies. This should not be a major issue in practice,
as the functional dependencies actually rely on per-column statistics,
which were always reset anyway.
Author: Tomas Vondra
Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu
Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
2021-03-26 23:22:01 +01:00
|
|
|
static void
|
|
|
|
_outStatsElem(StringInfo str, const StatsElem *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("STATSELEM");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
|
|
|
WRITE_NODE_FIELD(expr);
|
|
|
|
}
|
|
|
|
|
2002-12-12 16:49:42 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outQuery(StringInfo str, const Query *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("QUERY");
|
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(commandType, CmdType);
|
|
|
|
WRITE_ENUM_FIELD(querySource, QuerySource);
|
2012-03-27 21:14:13 +02:00
|
|
|
/* we intentionally do not print the queryId field */
|
2003-05-02 22:54:36 +02:00
|
|
|
WRITE_BOOL_FIELD(canSetTag);
|
2002-12-12 16:49:42 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Hack to work around missing outfuncs routines for a lot of the
|
|
|
|
* utility-statement node types. (The only one we actually *need* for
|
|
|
|
* rules support is NotifyStmt.) Someday we ought to support 'em all, but
|
|
|
|
* for the meantime do this to avoid getting lots of warnings when running
|
|
|
|
* with debug_print_parse on.
|
|
|
|
*/
|
|
|
|
if (node->utilityStmt)
|
2001-01-07 02:08:48 +01:00
|
|
|
{
|
2002-12-12 16:49:42 +01:00
|
|
|
switch (nodeTag(node->utilityStmt))
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-12-12 16:49:42 +01:00
|
|
|
case T_CreateStmt:
|
|
|
|
case T_IndexStmt:
|
|
|
|
case T_NotifyStmt:
|
2003-03-10 04:53:52 +01:00
|
|
|
case T_DeclareCursorStmt:
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(utilityStmt);
|
|
|
|
break;
|
|
|
|
default:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " :utilityStmt ?");
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
2002-12-12 16:49:42 +01:00
|
|
|
else
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " :utilityStmt <>");
|
2002-12-12 16:49:42 +01:00
|
|
|
|
|
|
|
WRITE_INT_FIELD(resultRelation);
|
|
|
|
WRITE_BOOL_FIELD(hasAggs);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_BOOL_FIELD(hasWindowFuncs);
|
2016-09-13 19:54:24 +02:00
|
|
|
WRITE_BOOL_FIELD(hasTargetSRFs);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_BOOL_FIELD(hasSubLinks);
|
2008-08-02 23:32:01 +02:00
|
|
|
WRITE_BOOL_FIELD(hasDistinctOn);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_BOOL_FIELD(hasRecursive);
|
2011-02-26 00:56:23 +01:00
|
|
|
WRITE_BOOL_FIELD(hasModifyingCTE);
|
2009-10-28 15:55:47 +01:00
|
|
|
WRITE_BOOL_FIELD(hasForUpdate);
|
Row-Level Security Policies (RLS)
Building on the updatable security-barrier views work, add the
ability to define policies on tables to limit the set of rows
which are returned from a query and which are allowed to be added
to a table. Expressions defined by the policy for filtering are
added to the security barrier quals of the query, while expressions
defined to check records being added to a table are added to the
with-check options of the query.
New top-level commands are CREATE/ALTER/DROP POLICY and are
controlled by the table owner. Row Security can be enabled
and disabled by the owner on a per-table basis using
ALTER TABLE .. ENABLE/DISABLE ROW SECURITY.
Per discussion, ROW SECURITY is disabled on tables by default and
must be enabled for policies on the table to be used. If no
policies exist on a table with ROW SECURITY enabled, a default-deny
policy is used and no records will be visible.
By default, row security is applied at all times except for the
table owner and the superuser. A new GUC, row_security, is added
which can be set to ON, OFF, or FORCE. When set to FORCE, row
security will be applied even for the table owner and superusers.
When set to OFF, row security will be disabled when allowed and an
error will be thrown if the user does not have rights to bypass row
security.
Per discussion, pg_dump sets row_security = OFF by default to ensure
that exports and backups will have all data in the table or will
error if there are insufficient privileges to bypass row security.
A new option has been added to pg_dump, --enable-row-security, to
ask pg_dump to export with row security enabled.
A new role capability, BYPASSRLS, which can only be set by the
superuser, is added to allow other users to be able to bypass row
security using row_security = OFF.
Many thanks to the various individuals who have helped with the
design, particularly Robert Haas for his feedback.
Authors include Craig Ringer, KaiGai Kohei, Adam Brightwell, Dean
Rasheed, with additional changes and rework by me.
Reviewers have included all of the above, Greg Smith,
Jeff McCormick, and Robert Haas.
2014-09-19 17:18:35 +02:00
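A minimal sketch of the commands described above, with hypothetical table,
column, and policy names:
-- illustrative only
ALTER TABLE accounts ENABLE ROW SECURITY;
CREATE POLICY account_owner ON accounts
    USING (owner = current_user);
-- with ROW SECURITY enabled and no applicable policy, default-deny hides all rows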
|
|
|
WRITE_BOOL_FIELD(hasRowSecurity);
|
2021-04-07 21:30:08 +02:00
|
|
|
WRITE_BOOL_FIELD(isReturn);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(cteList);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(rtable);
|
|
|
|
WRITE_NODE_FIELD(jointree);
|
|
|
|
WRITE_NODE_FIELD(targetList);
|
2017-04-06 14:33:16 +02:00
|
|
|
WRITE_ENUM_FIELD(override, OverridingKind);
|
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows specifying an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using an
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
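A minimal upsert sketch per the description above (table and column names are
hypothetical):
-- illustrative only: EXCLUDED refers to the row proposed for insertion
INSERT INTO counters (key, n)
VALUES ('hits', 1)
ON CONFLICT (key) DO UPDATE
    SET n = counters.n + EXCLUDED.n
    WHERE counters.n < 1000000;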
|
|
|
WRITE_NODE_FIELD(onConflict);
|
2006-08-12 04:52:06 +02:00
|
|
|
WRITE_NODE_FIELD(returningList);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(groupClause);
|
Implement GROUP BY DISTINCT
With grouping sets, it's possible that some of the grouping sets are
duplicates. This is especially common with CUBE and ROLLUP clauses. For
example GROUP BY CUBE (a,b), CUBE (b,c) is equivalent to
GROUP BY GROUPING SETS (
(a, b, c),
(a, b, c),
(a, b, c),
(a, b),
(a, b),
(a, b),
(a),
(a),
(a),
(c, a),
(c, a),
(c, a),
(c),
(b, c),
(b),
()
)
Some of the grouping sets are calculated multiple times, which is mostly
unnecessary. This commit implements a new GROUP BY DISTINCT feature, as
defined in the SQL standard, which eliminates the duplicate sets.
Author: Vik Fearing
Reviewed-by: Erik Rijkers, Georgios Kokolatos, Tomas Vondra
Discussion: https://postgr.es/m/bf3805a8-d7d1-ae61-fece-761b7ff41ecc@postgresfriends.org
2021-03-18 17:45:38 +01:00
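A hedged example of the new syntax, matching the expansion above (the table
t(a,b,c) is hypothetical); DISTINCT makes each distinct set be computed only
once:
-- illustrative only
SELECT a, b, c, count(*)
FROM t
GROUP BY DISTINCT CUBE (a, b), CUBE (b, c);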
|
|
|
WRITE_BOOL_FIELD(groupDistinct);
|
Support GROUPING SETS, CUBE and ROLLUP.
This SQL-standard functionality allows aggregating data by different
GROUP BY clauses at once. Each grouping set returns rows in which the
columns grouped by other sets are set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg nodes to describe each
phase; they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows
avoiding the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00
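A minimal sketch of the feature (the sales table is hypothetical):
-- illustrative only: one pass produces per-brand totals, per-size totals,
-- and a grand total
SELECT brand, size, sum(sales)
FROM items_sold
GROUP BY GROUPING SETS ((brand), (size), ());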
|
|
|
WRITE_NODE_FIELD(groupingSets);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(havingQual);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_NODE_FIELD(windowClause);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(distinctClause);
|
|
|
|
WRITE_NODE_FIELD(sortClause);
|
|
|
|
WRITE_NODE_FIELD(limitOffset);
|
|
|
|
WRITE_NODE_FIELD(limitCount);
|
2020-04-07 22:22:13 +02:00
|
|
|
WRITE_ENUM_FIELD(limitOption, LimitOption);
|
2006-04-30 20:30:40 +02:00
|
|
|
WRITE_NODE_FIELD(rowMarks);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_NODE_FIELD(setOperations);
|
2010-08-07 04:44:09 +02:00
|
|
|
WRITE_NODE_FIELD(constraintDeps);
|
2018-09-18 21:08:28 +02:00
|
|
|
WRITE_NODE_FIELD(withCheckOptions);
|
2022-03-28 16:45:58 +02:00
|
|
|
WRITE_NODE_FIELD(mergeActionList);
|
|
|
|
WRITE_BOOL_FIELD(mergeUseOuterJoin);
|
Change representation of statement lists, and add statement location info.
This patch makes several changes that improve the consistency of
representation of lists of statements. It's always been the case
that the output of parse analysis is a list of Query nodes, whatever
the types of the individual statements in the list. This patch brings
similar consistency to the outputs of raw parsing and planning steps:
* The output of raw parsing is now always a list of RawStmt nodes;
the statement-type-dependent nodes are one level down from that.
* The output of pg_plan_queries() is now always a list of PlannedStmt
nodes, even for utility statements. In the case of a utility statement,
"planning" just consists of wrapping a CMD_UTILITY PlannedStmt around
the utility node. This list representation is now used in Portal and
CachedPlan plan lists, replacing the former convention of intermixing
PlannedStmts with bare utility-statement nodes.
Now, every list of statements has a consistent head-node type depending
on how far along it is in processing. This allows changing many places
that formerly used generic "Node *" pointers to use a more specific
pointer type, thus reducing the number of IsA() tests and casts needed,
as well as improving code clarity.
Also, the post-parse-analysis representation of DECLARE CURSOR is changed
so that it looks more like EXPLAIN, PREPARE, etc. That is, the contained
SELECT remains a child of the DeclareCursorStmt rather than getting flipped
around to be the other way. It's now true for both Query and PlannedStmt
that utilityStmt is non-null if and only if commandType is CMD_UTILITY.
That allows simplifying a lot of places that were testing both fields.
(I think some of those were just defensive programming, but in many places,
it was actually necessary to avoid confusing DECLARE CURSOR with SELECT.)
Because PlannedStmt carries a canSetTag field, we're also able to get rid
of some ad-hoc rules about how to reconstruct canSetTag for a bare utility
statement; specifically, the assumption that a utility is canSetTag if and
only if it's the only one in its list. While I see no near-term need for
relaxing that restriction, it's nice to get rid of the ad-hocery.
The API of ProcessUtility() is changed so that what it's passed is the
wrapper PlannedStmt not just the bare utility statement. This will affect
all users of ProcessUtility_hook, but the changes are pretty trivial; see
the affected contrib modules for examples of the minimum change needed.
(Most compilers should give pointer-type-mismatch warnings for uncorrected
code.)
There's also a change in the API of ExplainOneQuery_hook, to pass through
cursorOptions instead of expecting hook functions to know what to pick.
This is needed because of the DECLARE CURSOR changes, but really should
have been done in 9.6; it's unlikely that any extant hook functions
know about using CURSOR_OPT_PARALLEL_OK.
Finally, teach gram.y to save statement boundary locations in RawStmt
nodes, and pass those through to Query and PlannedStmt nodes. This allows
more intelligent handling of cases where a source query string contains
multiple statements. This patch doesn't actually do anything with the
information, but a follow-on patch will. (Passing this information through
cleanly is the true motivation for these changes; while I think this is all
good cleanup, it's unlikely we'd have bothered without this end goal.)
catversion bump because addition of location fields to struct Query
affects stored rules.
This patch is by me, but it owes a good deal to Fabien Coelho who did
a lot of preliminary work on the problem, and also reviewed the patch.
Discussion: https://postgr.es/m/alpine.DEB.2.20.1612200926310.29821@lancre
2017-01-14 22:02:35 +01:00
|
|
|
WRITE_LOCATION_FIELD(stmt_location);
|
2020-05-26 01:23:48 +02:00
|
|
|
WRITE_INT_FIELD(stmt_len);
|
2002-12-12 16:49:42 +01:00
|
|
|
}
|
|
|
|
|
2013-07-18 23:10:16 +02:00
|
|
|
static void
|
|
|
|
_outWithCheckOption(StringInfo str, const WithCheckOption *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WITHCHECKOPTION");
|
|
|
|
|
2015-04-25 02:34:26 +02:00
|
|
|
WRITE_ENUM_FIELD(kind, WCOKind);
|
|
|
|
WRITE_STRING_FIELD(relname);
|
2015-09-15 21:49:31 +02:00
|
|
|
WRITE_STRING_FIELD(polname);
|
2013-07-18 23:10:16 +02:00
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
WRITE_BOOL_FIELD(cascaded);
|
|
|
|
}
|
|
|
|
|
2002-12-12 16:49:42 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSortGroupClause(StringInfo str, const SortGroupClause *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
2008-08-02 23:32:01 +02:00
|
|
|
WRITE_NODE_TYPE("SORTGROUPCLAUSE");
|
2002-12-12 16:49:42 +01:00
|
|
|
|
|
|
|
WRITE_UINT_FIELD(tleSortGroupRef);
|
2008-08-02 23:32:01 +02:00
|
|
|
WRITE_OID_FIELD(eqop);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_OID_FIELD(sortop);
|
2007-01-09 03:14:16 +01:00
|
|
|
WRITE_BOOL_FIELD(nulls_first);
|
2010-10-31 02:55:20 +01:00
|
|
|
WRITE_BOOL_FIELD(hashable);
|
2002-12-12 16:49:42 +01:00
|
|
|
}
|
|
|
|
|
2015-05-16 03:40:59 +02:00
|
|
|
static void
|
|
|
|
_outGroupingSet(StringInfo str, const GroupingSet *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("GROUPINGSET");
|
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(kind, GroupingSetKind);
|
|
|
|
WRITE_NODE_FIELD(content);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
2008-12-28 19:54:01 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outWindowClause(StringInfo str, const WindowClause *node)
|
2008-12-28 19:54:01 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WINDOWCLAUSE");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
|
|
|
WRITE_STRING_FIELD(refname);
|
|
|
|
WRITE_NODE_FIELD(partitionClause);
|
|
|
|
WRITE_NODE_FIELD(orderClause);
|
2008-12-31 01:08:39 +01:00
|
|
|
WRITE_INT_FIELD(frameOptions);
|
2010-02-12 18:33:21 +01:00
|
|
|
WRITE_NODE_FIELD(startOffset);
|
|
|
|
WRITE_NODE_FIELD(endOffset);
|
Teach planner and executor about monotonic window funcs
Window functions such as row_number() always return a value higher than
the previously returned value for tuples in any given window partition.
Traditionally queries such as;
SELECT * FROM (
SELECT *, row_number() over (order by c) rn
FROM t
) t WHERE rn <= 10;
were executed fairly inefficiently. Neither the query planner nor the
executor knew that once rn made it to 11, nothing further would match
the outer query's WHERE clause. It would blindly continue until all
tuples were exhausted from the subquery.
Here we implement means to make the above execute more efficiently.
This is done by way of adding a pg_proc.prosupport function to various of
the built-in window functions and adding supporting code to allow the
support function to inform the planner if the window function is
monotonically increasing, monotonically decreasing, both or neither. The
planner is then able to make use of that information and possibly allow
the executor to short-circuit execution by way of adding a "run condition"
to the WindowAgg to allow it to determine if some of its execution work
can be skipped.
This "run condition" is not like a normal filter. These run conditions
are only built using quals comparing values to monotonic window functions.
For monotonically increasing functions, quals making use of the btree
operators for <, <= and = can be used (assuming the window function column
is on the left). You can see here that once such a condition becomes false
that a monotonic increasing function could never make it subsequently true
again. For monotonically decreasing functions the >, >= and = btree
operators for the given type can be used for run conditions.
The best-case situation for this is when there is a single WindowAgg node
without a PARTITION BY clause. Here when the run condition becomes false
the WindowAgg node can simply return NULL. No more tuples will ever match
the run condition. It's a little more complex when there is a PARTITION
BY clause. In this case, we cannot return NULL as we must still process
other partitions. To speed this case up we pull tuples from the outer
plan to check if they're from the same partition and simply discard them
if they are. When we find a tuple belonging to another partition we start
processing as normal again until the run condition becomes false or we run
out of tuples to process.
When there are multiple WindowAgg nodes to evaluate then this complicates
the situation. For intermediate WindowAggs we must ensure we always
return all tuples to the calling node. Any filtering done could lead to
incorrect results in WindowAgg nodes above. For all intermediate nodes,
we can still save some work when the run condition becomes false. We've
no need to evaluate the WindowFuncs anymore. Other WindowAgg nodes cannot
reference the value of these and these tuples will not appear in the final
result anyway. The savings here are small in comparison to what can be
saved in the top-level WindowAgg, but still worthwhile.
Intermediate WindowAgg nodes never filter out tuples, but here we change
WindowAgg so that the top-level WindowAgg filters out tuples that don't
match the intermediate WindowAgg node's run condition. Such filters
appear in the "Filter" clause in EXPLAIN for the top-level WindowAgg node.
Here we add prosupport functions to allow the above to work for;
row_number(), rank(), dense_rank(), count(*) and count(expr). It appears
technically possible to do the same for min() and max(), however, it seems
unlikely to be useful enough, so that's not done here.
Bump catversion
Author: David Rowley
Reviewed-by: Andy Fan, Zhihong Yu
Discussion: https://postgr.es/m/CAApHDvqvp3At8++yF8ij06sdcoo1S_b2YoaT9D4Nf+MObzsrLQ@mail.gmail.com
2022-04-08 00:34:36 +02:00
|
|
|
WRITE_NODE_FIELD(runCondition);
|
Support all SQL:2011 options for window frame clauses.
This patch adds the ability to use "RANGE offset PRECEDING/FOLLOWING"
frame boundaries in window functions. We'd punted on that back in the
original patch to add window functions, because it was not clear how to
do it in a reasonably data-type-extensible fashion. That problem is
resolved here by adding the ability for btree operator classes to provide
an "in_range" support function that defines how to add or subtract the
RANGE offset value. Factoring it this way also allows the operator class
to avoid overflow problems near the ends of the datatype's range, if it
wishes to expend effort on that. (In the committed patch, the integer
opclasses handle that issue, but it did not seem worth the trouble to
avoid overflow failures for datetime types.)
The patch includes in_range support for the integer_ops opfamily
(int2/int4/int8) as well as the standard datetime types. Support for
other numeric types has been requested, but that seems like suitable
material for a follow-on patch.
In addition, the patch adds GROUPS mode which counts the offset in
ORDER-BY peer groups rather than rows, and it adds the frame_exclusion
options specified by SQL:2011. As far as I can see, we are now fully
up to spec on window framing options.
Existing behaviors remain unchanged, except that I changed the errcode
for a couple of existing error reports to meet the SQL spec's expectation
that negative "offset" values should be reported as SQLSTATE 22013.
Internally and in relevant parts of the documentation, we now consistently
use the terminology "offset PRECEDING/FOLLOWING" rather than "value
PRECEDING/FOLLOWING", since the term "value" is confusingly vague.
Oliver Ford, reviewed and whacked around some by me
Discussion: https://postgr.es/m/CAGMVOdu9sivPAxbNN0X+q19Sfv9edEPv=HibOJhB14TJv_RCQg@mail.gmail.com
2018-02-07 06:06:50 +01:00
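A hedged sketch of the newly supported frame options (table and column names
are hypothetical):
-- illustrative only: RANGE with an offset, and GROUPS mode with frame exclusion
SELECT ts,
       avg(val) OVER (ORDER BY ts
                      RANGE BETWEEN '1 hour'::interval PRECEDING AND CURRENT ROW),
       sum(val) OVER (ORDER BY val
                      GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING
                      EXCLUDE CURRENT ROW)
FROM readings;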
|
|
|
WRITE_OID_FIELD(startInRangeFunc);
|
|
|
|
WRITE_OID_FIELD(endInRangeFunc);
|
|
|
|
WRITE_OID_FIELD(inRangeColl);
|
|
|
|
WRITE_BOOL_FIELD(inRangeAsc);
|
|
|
|
WRITE_BOOL_FIELD(inRangeNullsFirst);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_UINT_FIELD(winref);
|
|
|
|
WRITE_BOOL_FIELD(copiedOrder);
|
|
|
|
}
|
|
|
|
|
2006-04-30 20:30:40 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRowMarkClause(StringInfo str, const RowMarkClause *node)
|
2006-04-30 20:30:40 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("ROWMARKCLAUSE");
|
|
|
|
|
|
|
|
WRITE_UINT_FIELD(rti);
|
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR
KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each
other, in contrast with already existing "SELECT FOR SHARE" and "SELECT
FOR UPDATE". UPDATE commands that do not modify the values stored in
the columns that are part of the key of the tuple now grab a SELECT FOR
NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently
with tuple locks of the FOR KEY SHARE variety.
Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this
means the concurrency improvement applies to them, which is the whole
point of this patch.
The added tuple lock semantics require some rejiggering of the multixact
module, so that the locking level that each transaction is holding can
be stored alongside its Xid. Also, multixacts now need to persist
across server restarts and crashes, because they can now represent not
only tuple locks, but also tuple updates. This means we need more
careful tracking of lifetime of pg_multixact SLRU files; since they now
persist longer, we require more infrastructure to figure out when they
can be removed. pg_upgrade also needs to be careful to copy
pg_multixact files over from the old server to the new, or at least part
of multixact.c state, depending on the versions of the old and new
servers.
Tuple time qualification rules (HeapTupleSatisfies routines) need to be
careful not to consider tuples with the "is multi" infomask bit set as
being only locked; they might need to look up MultiXact values (i.e.
possibly do pg_multixact I/O) to find out the Xid that updated a tuple,
whereas they previously were assured to only use information readily
available from the tuple header. This is considered acceptable, because
the extra I/O would involve cases that would previously cause some
commands to block waiting for concurrent transactions to finish.
Another important change is the fact that locking tuples that have
previously been updated causes the future versions to be marked as
locked, too; this is essential for correctness of foreign key checks.
This also causes additional WAL-logging (there was previously a single
WAL record for a locked tuple; now there are as many as there are updated
copies of the tuple.)
With all this in place, contention related to tuples being checked by
foreign key rules should be much reduced.
As a bonus, the old behavior that a subtransaction grabbing a stronger
tuple lock than the parent (sub)transaction held on a given tuple and
later aborting caused the weaker lock to be lost, has been fixed.
Many new spec files were added for isolation tester framework, to ensure
overall behavior is sane. There's probably room for several more tests.
There were several reviewers of this patch; in particular, Noah Misch
and Andres Freund spent considerable time in it. Original idea for the
patch came from Simon Riggs, after a problem report by Joel Jacobson.
Most code is from me, with contributions from Marti Raudsepp, Alexander
Shulgin, Noah Misch and Andres Freund.
This patch was discussed in several pgsql-hackers threads; the most
important start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
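A minimal sketch of the two added lock strengths (table name is hypothetical):
-- illustrative only
BEGIN;
SELECT * FROM parent WHERE id = 1 FOR KEY SHARE;     -- what FK triggers now take
SELECT * FROM parent WHERE id = 2 FOR NO KEY UPDATE; -- what non-key UPDATEs take
COMMIT;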
|
|
|
WRITE_ENUM_FIELD(strength, LockClauseStrength);
|
2014-10-07 22:23:34 +02:00
|
|
|
WRITE_ENUM_FIELD(waitPolicy, LockWaitPolicy);
|
2009-10-28 15:55:47 +01:00
|
|
|
WRITE_BOOL_FIELD(pushedDown);
|
2006-04-30 20:30:40 +02:00
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outWithClause(StringInfo str, const WithClause *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WITHCLAUSE");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(ctes);
|
|
|
|
WRITE_BOOL_FIELD(recursive);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
2021-02-01 13:54:59 +01:00
|
|
|
static void
|
|
|
|
_outCTESearchClause(StringInfo str, const CTESearchClause *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("CTESEARCHCLAUSE");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(search_col_list);
|
|
|
|
WRITE_BOOL_FIELD(search_breadth_first);
|
|
|
|
WRITE_STRING_FIELD(search_seq_column);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outCTECycleClause(StringInfo str, const CTECycleClause *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("CTECYCLECLAUSE");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(cycle_col_list);
|
|
|
|
WRITE_STRING_FIELD(cycle_mark_column);
|
|
|
|
WRITE_NODE_FIELD(cycle_mark_value);
|
|
|
|
WRITE_NODE_FIELD(cycle_mark_default);
|
|
|
|
WRITE_STRING_FIELD(cycle_path_column);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
WRITE_OID_FIELD(cycle_mark_type);
|
|
|
|
WRITE_INT_FIELD(cycle_mark_typmod);
|
|
|
|
WRITE_OID_FIELD(cycle_mark_collation);
|
|
|
|
WRITE_OID_FIELD(cycle_mark_neop);
|
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outCommonTableExpr(StringInfo str, const CommonTableExpr *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("COMMONTABLEEXPR");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(ctename);
|
|
|
|
WRITE_NODE_FIELD(aliascolnames);
|
Allow user control of CTE materialization, and change the default behavior.
Historically we've always materialized the full output of a CTE query,
treating WITH as an optimization fence (so that, for example, restrictions
from the outer query cannot be pushed into it). This is appropriate when
the CTE query is INSERT/UPDATE/DELETE, or is recursive; but when the CTE
query is non-recursive and side-effect-free, there's no hazard of changing
the query results by pushing restrictions down.
Another argument for materialization is that it can avoid duplicate
computation of an expensive WITH query --- but that only applies if
the WITH query is called more than once in the outer query. Even then
it could still be a net loss, if each call has restrictions that
would allow just a small part of the WITH query to be computed.
Hence, let's change the behavior for WITH queries that are non-recursive
and side-effect-free. By default, we will inline them into the outer
query (removing the optimization fence) if they are called just once.
If they are called more than once, we will keep the old behavior by
default, but the user can override this and force inlining by specifying
NOT MATERIALIZED. Lastly, the user can force the old behavior by
specifying MATERIALIZED; this would mainly be useful when the query had
deliberately been employing WITH as an optimization fence to prevent a
poor choice of plan.
Andreas Karlsson, Andrew Gierth, David Fetter
Discussion: https://postgr.es/m/87sh48ffhb.fsf@news-spur.riddles.org.uk
2019-02-16 22:11:12 +01:00
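A hedged sketch of the user-visible knobs described above (table and column
names are hypothetical):
-- illustrative only
WITH w AS NOT MATERIALIZED (SELECT * FROM big_table)  -- force inlining
SELECT * FROM w WHERE key = 42;
WITH w AS MATERIALIZED (SELECT * FROM big_table)      -- keep the optimization fence
SELECT * FROM w WHERE key = 42;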
|
|
|
WRITE_ENUM_FIELD(ctematerialized, CTEMaterialize);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(ctequery);
|
2021-02-01 13:54:59 +01:00
|
|
|
WRITE_NODE_FIELD(search_clause);
|
|
|
|
WRITE_NODE_FIELD(cycle_clause);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
WRITE_BOOL_FIELD(cterecursive);
|
|
|
|
WRITE_INT_FIELD(cterefcount);
|
|
|
|
WRITE_NODE_FIELD(ctecolnames);
|
|
|
|
WRITE_NODE_FIELD(ctecoltypes);
|
|
|
|
WRITE_NODE_FIELD(ctecoltypmods);
|
2011-02-08 22:04:18 +01:00
|
|
|
WRITE_NODE_FIELD(ctecolcollations);
|
2008-10-04 23:56:55 +02:00
|
|
|
}
|
|
|
|
|
2022-03-28 16:45:58 +02:00
|
|
|
static void
|
|
|
|
_outMergeWhenClause(StringInfo str, const MergeWhenClause *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MERGEWHENCLAUSE");
|
|
|
|
|
|
|
|
WRITE_BOOL_FIELD(matched);
|
|
|
|
WRITE_ENUM_FIELD(commandType, CmdType);
|
|
|
|
WRITE_ENUM_FIELD(override, OverridingKind);
|
|
|
|
WRITE_NODE_FIELD(condition);
|
|
|
|
WRITE_NODE_FIELD(targetList);
|
|
|
|
WRITE_NODE_FIELD(values);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outMergeAction(StringInfo str, const MergeAction *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MERGEACTION");
|
|
|
|
|
|
|
|
WRITE_BOOL_FIELD(matched);
|
|
|
|
WRITE_ENUM_FIELD(commandType, CmdType);
|
|
|
|
WRITE_ENUM_FIELD(override, OverridingKind);
|
|
|
|
WRITE_NODE_FIELD(qual);
|
|
|
|
WRITE_NODE_FIELD(targetList);
|
|
|
|
WRITE_NODE_FIELD(updateColnos);
|
|
|
|
}
|
|
|
|
|
2002-12-12 16:49:42 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSetOperationStmt(StringInfo str, const SetOperationStmt *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SETOPERATIONSTMT");
|
|
|
|
|
|
|
|
WRITE_ENUM_FIELD(op, SetOperation);
|
|
|
|
WRITE_BOOL_FIELD(all);
|
|
|
|
WRITE_NODE_FIELD(larg);
|
|
|
|
WRITE_NODE_FIELD(rarg);
|
2004-05-26 06:41:50 +02:00
|
|
|
WRITE_NODE_FIELD(colTypes);
|
2006-08-10 04:36:29 +02:00
|
|
|
WRITE_NODE_FIELD(colTypmods);
|
2011-02-08 22:04:18 +01:00
|
|
|
WRITE_NODE_FIELD(colCollations);
|
2008-08-07 03:11:52 +02:00
|
|
|
WRITE_NODE_FIELD(groupClauses);
|
2002-12-12 16:49:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRangeTblEntry(StringInfo str, const RangeTblEntry *node)
|
2002-12-12 16:49:42 +01:00
|
|
|
{
|
2021-09-15 16:35:41 +02:00
|
|
|
WRITE_NODE_TYPE("RANGETBLENTRY");
|
2002-12-12 16:49:42 +01:00
|
|
|
|
|
|
|
/* put alias + eref first to make dump more legible */
|
|
|
|
WRITE_NODE_FIELD(alias);
|
|
|
|
WRITE_NODE_FIELD(eref);
|
|
|
|
WRITE_ENUM_FIELD(rtekind, RTEKind);
|
|
|
|
|
|
|
|
switch (node->rtekind)
|
|
|
|
{
|
|
|
|
case RTE_RELATION:
|
|
|
|
WRITE_OID_FIELD(relid);
|
2011-02-23 01:23:23 +01:00
|
|
|
WRITE_CHAR_FIELD(relkind);
|
Create an RTE field to record the query's lock mode for each relation.
Add RangeTblEntry.rellockmode, which records the appropriate lock mode for
each RTE_RELATION rangetable entry (either AccessShareLock, RowShareLock,
or RowExclusiveLock depending on the RTE's role in the query).
This patch creates the field and makes all creators of RTE nodes fill it
in reasonably, but for the moment nothing much is done with it. The plan
is to replace assorted post-parser logic that re-determines the right
lockmode to use with simple uses of rte->rellockmode. For now, just add
Asserts in each of those places that the rellockmode matches what they are
computing today. (In some cases the match isn't perfect, so the Asserts
are weaker than you might expect; but this seems OK, as per discussion.)
This passes check-world for me, but it seems worth pushing in this state
to see if the buildfarm finds any problems in cases I failed to test.
catversion bump due to change of stored rules.
Amit Langote, reviewed by David Rowley and Jesper Pedersen,
and whacked around a bit more by me
Discussion: https://postgr.es/m/468c85d9-540e-66a2-1dde-fec2b741e688@lab.ntt.co.jp
2018-09-30 19:55:51 +02:00
|
|
|
WRITE_INT_FIELD(rellockmode);
|
2015-05-15 20:37:10 +02:00
|
|
|
WRITE_NODE_FIELD(tablesample);
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
|
|
|
case RTE_SUBQUERY:
|
|
|
|
WRITE_NODE_FIELD(subquery);
|
2011-12-22 22:15:57 +01:00
|
|
|
WRITE_BOOL_FIELD(security_barrier);
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
2008-10-04 23:56:55 +02:00
|
|
|
case RTE_JOIN:
|
|
|
|
WRITE_ENUM_FIELD(jointype, JoinType);
|
Reconsider the representation of join alias Vars.
The core idea of this patch is to make the parser generate join alias
Vars (that is, ones with varno pointing to a JOIN RTE) only when the
alias Var is actually different from any raw join input, that is, a type
coercion and/or COALESCE is necessary to generate the join output value.
Otherwise just generate varno/varattno pointing to the relevant join
input column.
In effect, this means that the planner's flatten_join_alias_vars()
transformation is already done in the parser, for all cases except
(a) columns that are merged by JOIN USING and are transformed in the
process, and (b) whole-row join Vars. In principle that would allow
us to skip doing flatten_join_alias_vars() in many more queries than
we do now, but we don't have quite enough infrastructure to know that
we can do so --- in particular there's no cheap way to know whether
there are any whole-row join Vars. I'm not sure if it's worth the
trouble to add a Query-level flag for that, and in any case it seems
like fit material for a separate patch. But even without skipping the
work entirely, this should make flatten_join_alias_vars() faster,
particularly where there are nested joins that it previously had to
flatten recursively.
An essential part of this change is to replace Var nodes'
varnoold/varoattno fields with varnosyn/varattnosyn, which have
considerably more tightly-defined meanings than the old fields: when
they differ from varno/varattno, they identify the Var's position in
an aliased JOIN RTE, and the join alias is what ruleutils.c should
print for the Var. This is necessary because the varno change
destroyed ruleutils.c's ability to find the JOIN RTE from the Var's
varno.
Another way in which this change broke ruleutils.c is that it's no
longer feasible to determine, from a JOIN RTE's joinaliasvars list,
which join columns correspond to which columns of the join's immediate
input relations. (If those are sub-joins, the joinaliasvars entries
may point to columns of their base relations, not the sub-joins.)
But that was a horrid mess requiring a lot of fragile assumptions
already, so let's just bite the bullet and add some more JOIN RTE
fields to make it more straightforward to figure that out. I added
two integer-List fields containing the relevant column numbers from
the left and right input rels, plus a count of how many merged columns
there are.
This patch depends on the ParseNamespaceColumn infrastructure that
I added in commit 5815696bc. The biggest bit of code change is
restructuring transformFromClauseItem's handling of JOINs so that
the ParseNamespaceColumn data is propagated upward correctly.
Other than that and the ruleutils fixes, everything pretty much
just works, though some processing is now inessential. I grabbed
two pieces of low-hanging fruit in that line:
1. In find_expr_references, we don't need to recurse into join alias
Vars anymore. There aren't any except for references to merged USING
columns, which are more properly handled when we scan the join's RTE.
This change actually fixes an edge-case issue: we will now record a
dependency on any type-coercion function present in a USING column's
joinaliasvar, even if that join column has no references in the query
text. The odds of the missing dependency causing a problem seem quite
small: you'd have to posit somebody dropping an implicit cast between
two data types, without removing the types themselves, and then having
a stored rule containing a whole-row Var for a join whose USING merge
depends on that cast. So I don't feel a great need to change this in
the back branches. But in theory this way is more correct.
2. markRTEForSelectPriv and markTargetListOrigin don't need to recurse
into join alias Vars either, because the cases they care about don't
apply to alias Vars for USING columns that are semantically distinct
from the underlying columns. This removes the only case in which
markVarForSelectPriv could be called with NULL for the RTE, so adjust
the comments to describe that hack as being strictly internal to
markRTEForSelectPriv.
catversion bump required due to changes in stored rules.
Discussion: https://postgr.es/m/7115.1577986646@sss.pgh.pa.us
2020-01-09 17:56:59 +01:00
|
|
|
WRITE_INT_FIELD(joinmergedcols);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(joinaliasvars);
|
2020-01-09 17:56:59 +01:00
|
|
|
WRITE_NODE_FIELD(joinleftcols);
|
|
|
|
WRITE_NODE_FIELD(joinrightcols);
|
2021-03-31 17:09:24 +02:00
|
|
|
WRITE_NODE_FIELD(join_using_alias);
|
2008-10-04 23:56:55 +02:00
|
|
|
break;
|
2002-12-12 16:49:42 +01:00
|
|
|
case RTE_FUNCTION:
|
2013-11-22 01:37:02 +01:00
|
|
|
WRITE_NODE_FIELD(functions);
|
2013-07-29 17:38:01 +02:00
|
|
|
WRITE_BOOL_FIELD(funcordinality);
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
2017-03-08 16:39:37 +01:00
|
|
|
case RTE_TABLEFUNC:
|
|
|
|
WRITE_NODE_FIELD(tablefunc);
|
|
|
|
break;
|
2006-08-02 03:59:48 +02:00
|
|
|
case RTE_VALUES:
|
|
|
|
WRITE_NODE_FIELD(values_lists);
|
Fix reporting of column typmods for multi-row VALUES constructs.
expandRTE() and get_rte_attribute_type() reported the exprType() and
exprTypmod() values of the expressions in the first row of the VALUES as
being the column type/typmod returned by the VALUES RTE. That's fine for
the data type, since we coerce all expressions in a column to have the same
common type. But we don't coerce them to have a common typmod, so it was
possible for rows after the first one to return values that violate the
claimed column typmod. This leads to the incorrect result seen in bug
#14448 from Hassan Mahmood, as well as some other corner-case misbehaviors.
The desired behavior is the same as we use in other type-unification
cases: report the common typmod if there is one, but otherwise return -1
indicating no particular constraint. It's cheap for transformValuesClause
to determine the common typmod while transforming a multi-row VALUES, but
it'd be less cheap for expandRTE() and get_rte_attribute_type() to
re-determine that info every time they're asked --- possibly a lot less
cheap, if the VALUES has many rows. Therefore, the best fix is to record
the common typmods explicitly in a list in the VALUES RTE, as we were
already doing for column collations. This looks quite a bit like what
we're doing for CTE RTEs, so we can save a little bit of space and code by
unifying the representation for those two RTE types. They both now share
coltypes/coltypmods/colcollations fields. (At some point it might seem
desirable to populate those fields for all RTE types; but right now it
looks like constructing them for other RTE types would add more code and
cycles than it would save.)
The RTE change requires a catversion bump, so this fix is only usable
in HEAD. If we fix this at all in the back branches, the patch will
need to look quite different.
Report: https://postgr.es/m/20161205143037.4377.60754@wrigleys.postgresql.org
Discussion: https://postgr.es/m/27429.1480968538@sss.pgh.pa.us
2016-12-08 17:40:02 +01:00
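A hedged illustration of the corner case (the values are arbitrary): the
typmod of the VALUES column used to be taken from the first row only, so the
second row below could violate the claimed numeric(4,2); with this fix the
column reports typmod -1 unless all rows agree:
-- illustrative only
SELECT x FROM (VALUES (1.23::numeric(4,2)), (1.23456::numeric(7,5))) AS v(x);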
|
|
|
WRITE_NODE_FIELD(coltypes);
|
|
|
|
WRITE_NODE_FIELD(coltypmods);
|
|
|
|
WRITE_NODE_FIELD(colcollations);
|
2006-08-02 03:59:48 +02:00
|
|
|
break;
|
2008-10-04 23:56:55 +02:00
|
|
|
case RTE_CTE:
|
|
|
|
WRITE_STRING_FIELD(ctename);
|
|
|
|
WRITE_UINT_FIELD(ctelevelsup);
|
|
|
|
WRITE_BOOL_FIELD(self_reference);
|
2016-12-08 17:40:02 +01:00
|
|
|
WRITE_NODE_FIELD(coltypes);
|
|
|
|
WRITE_NODE_FIELD(coltypmods);
|
|
|
|
WRITE_NODE_FIELD(colcollations);
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
2017-04-01 06:17:18 +02:00
|
|
|
case RTE_NAMEDTUPLESTORE:
|
|
|
|
WRITE_STRING_FIELD(enrname);
|
2017-06-14 22:19:46 +02:00
|
|
|
WRITE_FLOAT_FIELD(enrtuples, "%.0f");
|
2017-04-01 06:17:18 +02:00
|
|
|
WRITE_OID_FIELD(relid);
|
|
|
|
WRITE_NODE_FIELD(coltypes);
|
|
|
|
WRITE_NODE_FIELD(coltypmods);
|
|
|
|
WRITE_NODE_FIELD(colcollations);
|
|
|
|
break;
|
In the planner, replace an empty FROM clause with a dummy RTE.
The fact that "SELECT expression" has no base relations has long been a
thorn in the side of the planner. It makes it hard to flatten a sub-query
that looks like that, or is a trivial VALUES() item, because the planner
generally uses relid sets to identify sub-relations, and such a sub-query
would have an empty relid set if we flattened it. prepjointree.c contains
some baroque logic that works around this in certain special cases --- but
there is a much better answer. We can replace an empty FROM clause with a
dummy RTE that acts like a table of one row and no columns, and then there
are no such corner cases to worry about. Instead we need some logic to
get rid of useless dummy RTEs, but that's simpler and covers more cases
than what was there before.
For really trivial cases, where the query is just "SELECT expression" and
nothing else, there's a hazard that adding the extra RTE makes for a
noticeable slowdown; even though it's not much processing, there's not
that much for the planner to do overall. However testing says that the
penalty is very small, close to the noise level. In more complex queries,
this is able to find optimizations that we could not find before.
The new RTE type is called RTE_RESULT, since the "scan" plan type it
gives rise to is a Result node (the same plan we produced for a "SELECT
expression" query before). To avoid confusion, rename the old ResultPath
path type to GroupResultPath, reflecting that it's only used in degenerate
grouping cases where we know the query produces just one grouped row.
(It wouldn't work to unify the two cases, because there are different
rules about where the associated quals live during query_planner.)
Note: although this touches readfuncs.c, I don't think a catversion
bump is required, because the added case can't occur in stored rules,
only plans.
Patch by me, reviewed by David Rowley and Mark Dilger
Discussion: https://postgr.es/m/15944.1521127664@sss.pgh.pa.us
2019-01-28 23:54:10 +01:00
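A minimal sketch of the dummy RTE described above, assuming only the public RangeTblEntry fields (the function name and the eref alias string are illustrative, not copied from prepjointree.c):

#include "postgres.h"

#include "nodes/makefuncs.h"
#include "nodes/parsenodes.h"

/*
 * Sketch only: an RTE_RESULT entry behaves like a table of one row and
 * no columns, so it needs no kind-specific fields at all.
 */
static RangeTblEntry *
make_result_rte_sketch(void)
{
	RangeTblEntry *rte = makeNode(RangeTblEntry);

	rte->rtekind = RTE_RESULT;
	rte->eref = makeAlias("*RESULT*", NIL);
	rte->inFromCl = true;

	return rte;
}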
|
|
|
case RTE_RESULT:
|
|
|
|
/* no extra fields */
|
|
|
|
break;
|
2002-12-12 16:49:42 +01:00
|
|
|
default:
|
2003-07-28 02:09:16 +02:00
|
|
|
elog(ERROR, "unrecognized RTE kind: %d", (int) node->rtekind);
|
2002-12-12 16:49:42 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2012-08-08 01:02:54 +02:00
|
|
|
WRITE_BOOL_FIELD(lateral);
|
2002-12-12 16:49:42 +01:00
|
|
|
WRITE_BOOL_FIELD(inh);
|
|
|
|
WRITE_BOOL_FIELD(inFromCl);
|
2004-01-15 00:01:55 +01:00
|
|
|
WRITE_UINT_FIELD(requiredPerms);
|
2005-06-28 07:09:14 +02:00
|
|
|
WRITE_OID_FIELD(checkAsUser);
|
2009-01-22 21:16:10 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(selectedCols);
|
2015-05-08 00:20:46 +02:00
|
|
|
WRITE_BITMAPSET_FIELD(insertedCols);
|
|
|
|
WRITE_BITMAPSET_FIELD(updatedCols);
|
2019-03-30 08:13:09 +01:00
|
|
|
WRITE_BITMAPSET_FIELD(extraUpdatedCols);
|
2014-04-13 03:04:58 +02:00
|
|
|
WRITE_NODE_FIELD(securityQuals);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
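An aside on the bitmapset permission fields written above (selectedCols, insertedCols, updatedCols, extraUpdatedCols): by the convention documented with RangeTblEntry in parsenodes.h, the stored members are attribute numbers offset by FirstLowInvalidHeapAttributeNumber, so that system columns and the whole-row reference (attnum 0) can be represented. A hedged usage sketch:

#include "postgres.h"

#include "access/sysattr.h"
#include "nodes/bitmapset.h"
#include "nodes/parsenodes.h"

/* Sketch only: record that column "attnum" of this RTE is read. */
static void
mark_column_selected_sketch(RangeTblEntry *rte, AttrNumber attnum)
{
	rte->selectedCols =
		bms_add_member(rte->selectedCols,
					   attnum - FirstLowInvalidHeapAttributeNumber);
}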
|
|
|
|
|
Support multi-argument UNNEST(), and TABLE() syntax for multiple functions.
This patch adds the ability to write TABLE( function1(), function2(), ...)
as a single FROM-clause entry. The result is the concatenation of the
first row from each function, followed by the second row from each
function, etc; with NULLs inserted if any function produces fewer rows than
others. This is believed to be a much more useful behavior than what
Postgres currently does with multiple SRFs in a SELECT list.
This syntax also provides a reasonable way to combine use of column
definition lists with WITH ORDINALITY: put the column definition list
inside TABLE(), where it's clear that it doesn't control the ordinality
column as well.
Also implement SQL-compliant multiple-argument UNNEST(), by turning
UNNEST(a,b,c) into TABLE(unnest(a), unnest(b), unnest(c)).
The SQL standard specifies TABLE() with only a single function, not
multiple functions, and it seems to require an implicit UNNEST() which is
not what this patch does. There may be something wrong with that reading
of the spec, though, because if it's right then the spec's TABLE() is just
a pointless alternative spelling of UNNEST(). After further review of
that, we might choose to adopt a different syntax for what this patch does,
but in any case this functionality seems clearly worthwhile.
Andrew Gierth, reviewed by Zoltán Böszörményi and Heikki Linnakangas, and
significantly revised by me
2013-11-22 01:37:02 +01:00
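The row-combination rule in the commit message above is easy to state in plain C; this standalone sketch (not executor code — the real work happens in the FunctionScan executor node) models each function's output as a column of ints and pads exhausted functions with NULLs:

#include <stdbool.h>
#include <stddef.h>

static void
combine_rows_sketch(int **cols, size_t *nrows, size_t nfuncs,
					void (*emit) (bool isnull, int value))
{
	size_t		maxrows = 0;

	/* the combined rowcount is the longest function's rowcount */
	for (size_t f = 0; f < nfuncs; f++)
		if (nrows[f] > maxrows)
			maxrows = nrows[f];

	for (size_t r = 0; r < maxrows; r++)
	{
		for (size_t f = 0; f < nfuncs; f++)
		{
			if (r < nrows[f])
				emit(false, cols[f][r]);	/* function still has rows */
			else
				emit(true, 0);	/* ran out early: pad with NULL */
		}
	}
}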
|
|
|
static void
|
|
|
|
_outRangeTblFunction(StringInfo str, const RangeTblFunction *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGETBLFUNCTION");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(funcexpr);
|
|
|
|
WRITE_INT_FIELD(funccolcount);
|
|
|
|
WRITE_NODE_FIELD(funccolnames);
|
|
|
|
WRITE_NODE_FIELD(funccoltypes);
|
|
|
|
WRITE_NODE_FIELD(funccoltypmods);
|
|
|
|
WRITE_NODE_FIELD(funccolcollations);
|
|
|
|
WRITE_BITMAPSET_FIELD(funcparams);
|
|
|
|
}
|
|
|
|
|
Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM. (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.) Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type. This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which is undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
Back-patch to 9.5 so that we don't have to support the original API
in production.
2015-07-25 20:39:00 +02:00
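As a hedged sketch of the handler convention described above (names are illustrative and the callback wiring is elided), a tablesample method extension provides a single function returning type tsm_handler that hands back a filled-in TsmRoutine:

#include "postgres.h"

#include "access/tsmapi.h"
#include "catalog/pg_type.h"
#include "fmgr.h"

PG_FUNCTION_INFO_V1(my_tsm_handler);

Datum
my_tsm_handler(PG_FUNCTION_ARGS)
{
	TsmRoutine *tsm = makeNode(TsmRoutine);

	/* one float4 argument, e.g. a sampling percentage */
	tsm->parameterTypes = list_make1_oid(FLOAT4OID);

	/* honor REPEATABLE both within and across queries, per the above */
	tsm->repeatable_across_queries = true;
	tsm->repeatable_across_scans = true;

	/* SampleScanGetSampleSize, BeginSampleScan, etc. must be set here */

	PG_RETURN_POINTER(tsm);
}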
|
|
|
static void
|
|
|
|
_outTableSampleClause(StringInfo str, const TableSampleClause *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("TABLESAMPLECLAUSE");
|
|
|
|
|
|
|
|
WRITE_OID_FIELD(tsmhandler);
|
|
|
|
WRITE_NODE_FIELD(args);
|
|
|
|
WRITE_NODE_FIELD(repeatable);
|
|
|
|
}
|
|
|
|
|
1997-12-23 20:50:54 +01:00
|
|
|
static void
|
2021-07-21 10:24:06 +02:00
|
|
|
_outA_Expr(StringInfo str, const A_Expr *node)
|
1997-12-23 20:50:54 +01:00
|
|
|
{
|
2022-07-08 11:03:45 +02:00
|
|
|
WRITE_NODE_TYPE("A_EXPR");
|
2002-11-25 19:12:12 +01:00
|
|
|
|
2003-02-10 05:44:47 +01:00
|
|
|
switch (node->kind)
|
1998-05-10 01:46:35 +02:00
|
|
|
{
|
2003-02-10 05:44:47 +01:00
|
|
|
case AEXPR_OP:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoChar(str, ' ');
|
2003-02-10 05:44:47 +01:00
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2003-06-29 02:33:44 +02:00
|
|
|
case AEXPR_OP_ANY:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoChar(str, ' ');
|
2003-06-29 02:33:44 +02:00
|
|
|
WRITE_NODE_FIELD(name);
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " ANY ");
|
2003-06-29 02:33:44 +02:00
|
|
|
break;
|
|
|
|
case AEXPR_OP_ALL:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoChar(str, ' ');
|
2003-06-29 02:33:44 +02:00
|
|
|
WRITE_NODE_FIELD(name);
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " ALL ");
|
2003-06-29 02:33:44 +02:00
|
|
|
break;
|
2003-02-10 05:44:47 +01:00
|
|
|
case AEXPR_DISTINCT:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " DISTINCT ");
|
2003-02-10 05:44:47 +01:00
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2016-07-28 23:23:03 +02:00
|
|
|
case AEXPR_NOT_DISTINCT:
|
|
|
|
appendStringInfoString(str, " NOT_DISTINCT ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2003-02-16 03:30:39 +01:00
|
|
|
case AEXPR_NULLIF:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " NULLIF ");
|
2003-02-16 03:30:39 +01:00
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2005-11-28 05:35:32 +01:00
|
|
|
case AEXPR_IN:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " IN ");
|
2005-11-28 05:35:32 +01:00
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2015-02-23 18:46:46 +01:00
|
|
|
case AEXPR_LIKE:
|
|
|
|
appendStringInfoString(str, " LIKE ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
|
|
|
case AEXPR_ILIKE:
|
|
|
|
appendStringInfoString(str, " ILIKE ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
|
|
|
case AEXPR_SIMILAR:
|
|
|
|
appendStringInfoString(str, " SIMILAR ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2015-02-22 19:57:56 +01:00
|
|
|
case AEXPR_BETWEEN:
|
|
|
|
appendStringInfoString(str, " BETWEEN ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
|
|
|
case AEXPR_NOT_BETWEEN:
|
|
|
|
appendStringInfoString(str, " NOT_BETWEEN ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
|
|
|
case AEXPR_BETWEEN_SYM:
|
|
|
|
appendStringInfoString(str, " BETWEEN_SYM ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
|
|
|
case AEXPR_NOT_BETWEEN_SYM:
|
|
|
|
appendStringInfoString(str, " NOT_BETWEEN_SYM ");
|
|
|
|
WRITE_NODE_FIELD(name);
|
|
|
|
break;
|
2000-05-26 00:43:12 +02:00
|
|
|
default:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " ??");
|
2000-05-26 00:43:12 +02:00
|
|
|
break;
|
1998-05-10 01:46:35 +02:00
|
|
|
}
|
2002-11-25 19:12:12 +01:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(lexpr);
|
|
|
|
WRITE_NODE_FIELD(rexpr);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
1997-12-23 20:50:54 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
static void
|
Remove Value node struct
The Value node struct is a weird construct. It is its own node type,
but most of the time, it actually has a node type of Integer, Float,
String, or BitString. As a consequence, the struct name and the node
type don't match most of the time, and so it has to be treated
specially a lot. There doesn't seem to be any value in the special
construct. There is very little code that wants to accept all Value
variants but nothing else (and even if it did, this doesn't provide
any convenient way to check it), and most code wants either just one
particular node type (usually String), or it accepts a broader set of
node types besides just Value.
This change removes the Value struct and node type and replaces them
by separate Integer, Float, String, and BitString node types that are
proper node types and structs of their own and behave mostly like
normal node types.
Also, this removes the T_Null node tag, which was previously also a
possible variant of Value but wasn't actually used outside of the
Value contained in A_Const. Replace that by an isnull field in
A_Const.
Reviewed-by: Dagfinn Ilmari Mannsåker <ilmari@ilmari.org>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/5ba6bc5b-3f95-04f2-2419-f8ddb4c046fb@enterprisedb.com
2021-09-09 07:58:12 +02:00
|
|
|
_outInteger(StringInfo str, const Integer *node)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2022-01-14 10:46:49 +01:00
|
|
|
appendStringInfo(str, "%d", node->ival);
|
2021-09-09 07:58:12 +02:00
|
|
|
}
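A small hedged sketch of what the change described above means for callers (makeInteger/makeString and the intVal/strVal accessors are long-standing helpers; this particular usage is invented for illustration):

#include "postgres.h"

#include "nodes/value.h"

static void
value_nodes_sketch(void)
{
	Integer    *i = makeInteger(42);
	String	   *s = makeString("abc");

	/* each variant is now a proper node type of its own */
	Assert(IsA(i, Integer));
	Assert(IsA(s, String));

	elog(DEBUG1, "got %d and %s", intVal(i), strVal(s));
}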
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2021-09-09 07:58:12 +02:00
|
|
|
static void
|
|
|
|
_outFloat(StringInfo str, const Float *node)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We assume the value is a valid numeric literal and so does not need
|
|
|
|
* quoting.
|
|
|
|
*/
|
2022-01-14 10:46:49 +01:00
|
|
|
appendStringInfoString(str, node->fval);
|
2021-09-09 07:58:12 +02:00
|
|
|
}
|
|
|
|
|
2022-01-14 10:46:49 +01:00
|
|
|
static void
|
|
|
|
_outBoolean(StringInfo str, const Boolean *node)
|
|
|
|
{
|
|
|
|
appendStringInfoString(str, node->boolval ? "true" : "false");
|
|
|
|
}
|
|
|
|
|
2021-09-09 07:58:12 +02:00
|
|
|
static void
|
|
|
|
_outString(StringInfo str, const String *node)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We use outToken to provide escaping of the string's content, but we
|
|
|
|
* don't want it to do anything with an empty string.
|
|
|
|
*/
|
|
|
|
appendStringInfoChar(str, '"');
|
2022-01-14 10:46:49 +01:00
|
|
|
if (node->sval[0] != '\0')
|
|
|
|
outToken(str, node->sval);
|
2021-09-09 07:58:12 +02:00
|
|
|
appendStringInfoChar(str, '"');
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outBitString(StringInfo str, const BitString *node)
|
|
|
|
{
|
|
|
|
/* internal representation already has leading 'b' */
|
2022-01-14 10:46:49 +01:00
|
|
|
appendStringInfoString(str, node->bsval);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
1999-02-23 09:01:47 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outColumnRef(StringInfo str, const ColumnRef *node)
|
1999-02-23 09:01:47 +01:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("COLUMNREF");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(fields);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2002-03-21 17:02:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outParamRef(StringInfo str, const ParamRef *node)
|
2002-03-21 17:02:16 +01:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("PARAMREF");
|
|
|
|
|
|
|
|
WRITE_INT_FIELD(number);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2002-03-21 17:02:16 +01:00
|
|
|
}
|
|
|
|
|
2018-09-16 19:02:47 +02:00
|
|
|
/*
|
|
|
|
* Node types found in raw parse trees (supported for debug purposes)
|
|
|
|
*/
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outRawStmt(StringInfo str, const RawStmt *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RAWSTMT");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(stmt);
|
|
|
|
WRITE_LOCATION_FIELD(stmt_location);
|
|
|
|
WRITE_INT_FIELD(stmt_len);
|
|
|
|
}
|
|
|
|
|
1997-12-23 20:50:54 +01:00
|
|
|
static void
|
2021-07-21 10:24:06 +02:00
|
|
|
_outA_Const(StringInfo str, const A_Const *node)
|
1997-12-23 20:50:54 +01:00
|
|
|
{
|
2004-06-09 21:08:20 +02:00
|
|
|
WRITE_NODE_TYPE("A_CONST");
|
2002-11-25 19:12:12 +01:00
|
|
|
|
2021-09-09 07:58:12 +02:00
|
|
|
if (node->isnull)
|
|
|
|
appendStringInfoString(str, "NULL");
|
|
|
|
else
|
|
|
|
{
|
|
|
|
appendStringInfoString(str, " :val ");
|
|
|
|
outNode(str, &node->val);
|
|
|
|
}
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
1997-12-23 20:50:54 +01:00
|
|
|
}
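A hedged sketch of the representation handled just above: after the removal of the T_Null Value variant, a SQL NULL literal is an A_Const with isnull set (hand construction, for illustration only):

#include "postgres.h"

#include "nodes/parsenodes.h"

static A_Const *
make_null_const_sketch(int location)
{
	A_Const    *n = makeNode(A_Const);

	n->isnull = true;			/* replaces the old T_Null Value */
	n->location = location;

	return n;
}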
|
|
|
|
|
2008-08-30 03:39:14 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outA_Star(StringInfo str, const A_Star *node)
|
2008-08-30 03:39:14 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("A_STAR");
|
|
|
|
}
|
|
|
|
|
2002-03-21 17:02:16 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outA_Indices(StringInfo str, const A_Indices *node)
|
2002-03-21 17:02:16 +01:00
|
|
|
{
|
2004-06-09 21:08:20 +02:00
|
|
|
WRITE_NODE_TYPE("A_INDICES");
|
|
|
|
|
2015-12-23 03:05:16 +01:00
|
|
|
WRITE_BOOL_FIELD(is_slice);
|
2004-06-09 21:08:20 +02:00
|
|
|
WRITE_NODE_FIELD(lidx);
|
|
|
|
WRITE_NODE_FIELD(uidx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outA_Indirection(StringInfo str, const A_Indirection *node)
|
2004-06-09 21:08:20 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("A_INDIRECTION");
|
2002-11-25 19:12:12 +01:00
|
|
|
|
|
|
|
WRITE_NODE_FIELD(arg);
|
|
|
|
WRITE_NODE_FIELD(indirection);
|
2002-03-21 17:02:16 +01:00
|
|
|
}
|
|
|
|
|
2008-03-20 22:42:48 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outA_ArrayExpr(StringInfo str, const A_ArrayExpr *node)
|
2008-03-20 22:42:48 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("A_ARRAYEXPR");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(elements);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2008-03-20 22:42:48 +01:00
|
|
|
}
|
|
|
|
|
2004-06-09 21:08:20 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outResTarget(StringInfo str, const ResTarget *node)
|
2004-06-09 21:08:20 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RESTARGET");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
|
|
|
WRITE_NODE_FIELD(indirection);
|
|
|
|
WRITE_NODE_FIELD(val);
|
2008-08-29 01:09:48 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2004-06-09 21:08:20 +02:00
|
|
|
}
|
|
|
|
|
Implement UPDATE tab SET (col1,col2,...) = (SELECT ...), ...
This SQL-standard feature allows a sub-SELECT yielding multiple columns
(but only one row) to be used to compute the new values of several columns
to be updated. While the same results can be had with an independent
sub-SELECT per column, such a workaround can require a great deal of
duplicated computation.
The standard actually says that the source for a multi-column assignment
could be any row-valued expression. The implementation used here is
tightly tied to our existing sub-SELECT support and can't handle other
cases; the Bison grammar would have some issues with them too. However,
I don't feel too bad about this since other cases can be converted into
sub-SELECTs. For instance, "SET (a,b,c) = row_valued_function(x)" could
be written "SET (a,b,c) = (SELECT * FROM row_valued_function(x))".
2014-06-18 19:22:25 +02:00
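As a hedged sketch of how the node serialized below comes to exist (hand construction here; the grammar and parse analysis do this for real), "SET (a, b) = (SELECT ...)" yields one MultiAssignRef per target column, all sharing the same source:

#include "postgres.h"

#include "nodes/parsenodes.h"

static MultiAssignRef *
make_multiassign_ref_sketch(Node *source, int colno, int ncolumns)
{
	MultiAssignRef *r = makeNode(MultiAssignRef);

	r->source = source;			/* the common row source, e.g. a sub-SELECT */
	r->colno = colno;			/* 1-based position among the target columns */
	r->ncolumns = ncolumns;		/* total number of target columns */

	return r;
}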
|
|
|
static void
|
|
|
|
_outMultiAssignRef(StringInfo str, const MultiAssignRef *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("MULTIASSIGNREF");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(source);
|
|
|
|
WRITE_INT_FIELD(colno);
|
|
|
|
WRITE_INT_FIELD(ncolumns);
|
|
|
|
}
|
|
|
|
|
2008-07-17 18:02:12 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outSortBy(StringInfo str, const SortBy *node)
|
2008-07-17 18:02:12 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("SORTBY");
|
|
|
|
|
2008-09-01 22:42:46 +02:00
|
|
|
WRITE_NODE_FIELD(node);
|
2008-07-17 18:02:12 +02:00
|
|
|
WRITE_ENUM_FIELD(sortby_dir, SortByDir);
|
|
|
|
WRITE_ENUM_FIELD(sortby_nulls, SortByNulls);
|
|
|
|
WRITE_NODE_FIELD(useOp);
|
2008-09-01 22:42:46 +02:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
2008-07-17 18:02:12 +02:00
|
|
|
}
|
|
|
|
|
2008-12-28 19:54:01 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outWindowDef(StringInfo str, const WindowDef *node)
|
2008-12-28 19:54:01 +01:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("WINDOWDEF");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
|
|
|
WRITE_STRING_FIELD(refname);
|
|
|
|
WRITE_NODE_FIELD(partitionClause);
|
|
|
|
WRITE_NODE_FIELD(orderClause);
|
2008-12-31 01:08:39 +01:00
|
|
|
WRITE_INT_FIELD(frameOptions);
|
2010-02-12 18:33:21 +01:00
|
|
|
WRITE_NODE_FIELD(startOffset);
|
|
|
|
WRITE_NODE_FIELD(endOffset);
|
2008-12-28 19:54:01 +01:00
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
2008-10-04 23:56:55 +02:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRangeSubselect(StringInfo str, const RangeSubselect *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGESUBSELECT");
|
|
|
|
|
2012-08-08 01:02:54 +02:00
|
|
|
WRITE_BOOL_FIELD(lateral);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(subquery);
|
|
|
|
WRITE_NODE_FIELD(alias);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outRangeFunction(StringInfo str, const RangeFunction *node)
|
2008-10-04 23:56:55 +02:00
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGEFUNCTION");
|
|
|
|
|
2012-08-08 01:02:54 +02:00
|
|
|
WRITE_BOOL_FIELD(lateral);
|
2013-11-22 01:37:02 +01:00
|
|
|
WRITE_BOOL_FIELD(ordinality);
|
2013-12-10 15:34:37 +01:00
|
|
|
WRITE_BOOL_FIELD(is_rowsfrom);
|
2013-11-22 01:37:02 +01:00
|
|
|
WRITE_NODE_FIELD(functions);
|
2008-10-04 23:56:55 +02:00
|
|
|
WRITE_NODE_FIELD(alias);
|
|
|
|
WRITE_NODE_FIELD(coldeflist);
|
|
|
|
}
|
|
|
|
|
2015-07-25 20:39:00 +02:00
|
|
|
static void
|
|
|
|
_outRangeTableSample(StringInfo str, const RangeTableSample *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGETABLESAMPLE");
|
|
|
|
|
|
|
|
WRITE_NODE_FIELD(relation);
|
|
|
|
WRITE_NODE_FIELD(method);
|
|
|
|
WRITE_NODE_FIELD(args);
|
|
|
|
WRITE_NODE_FIELD(repeatable);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
2017-03-08 16:39:37 +01:00
|
|
|
static void
|
|
|
|
_outRangeTableFunc(StringInfo str, const RangeTableFunc *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGETABLEFUNC");
|
|
|
|
|
|
|
|
WRITE_BOOL_FIELD(lateral);
|
|
|
|
WRITE_NODE_FIELD(docexpr);
|
|
|
|
WRITE_NODE_FIELD(rowexpr);
|
|
|
|
WRITE_NODE_FIELD(namespaces);
|
|
|
|
WRITE_NODE_FIELD(columns);
|
|
|
|
WRITE_NODE_FIELD(alias);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
_outRangeTableFuncCol(StringInfo str, const RangeTableFuncCol *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("RANGETABLEFUNCCOL");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(colname);
|
|
|
|
WRITE_NODE_FIELD(typeName);
|
|
|
|
WRITE_BOOL_FIELD(for_ordinality);
|
|
|
|
WRITE_BOOL_FIELD(is_not_null);
|
|
|
|
WRITE_NODE_FIELD(colexpr);
|
|
|
|
WRITE_NODE_FIELD(coldefexpr);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
1998-12-04 16:34:49 +01:00
|
|
|
static void
|
2011-12-07 20:46:56 +01:00
|
|
|
_outConstraint(StringInfo str, const Constraint *node)
|
1998-12-04 16:34:49 +01:00
|
|
|
{
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_TYPE("CONSTRAINT");
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
WRITE_STRING_FIELD(conname);
|
|
|
|
WRITE_BOOL_FIELD(deferrable);
|
|
|
|
WRITE_BOOL_FIELD(initdeferred);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
1998-12-04 16:34:49 +01:00
|
|
|
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, " :contype ");
|
1998-12-04 16:34:49 +01:00
|
|
|
switch (node->contype)
|
|
|
|
{
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_NULL:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "NULL");
|
2009-07-30 04:45:38 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case CONSTR_NOTNULL:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "NOT_NULL");
|
2009-07-30 04:45:38 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case CONSTR_DEFAULT:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "DEFAULT");
|
2009-07-30 04:45:38 +02:00
|
|
|
WRITE_NODE_FIELD(raw_expr);
|
|
|
|
WRITE_STRING_FIELD(cooked_expr);
|
|
|
|
break;
|
|
|
|
|
2017-04-06 14:33:16 +02:00
|
|
|
case CONSTR_IDENTITY:
|
|
|
|
appendStringInfoString(str, "IDENTITY");
|
|
|
|
WRITE_NODE_FIELD(raw_expr);
|
|
|
|
WRITE_STRING_FIELD(cooked_expr);
|
|
|
|
WRITE_CHAR_FIELD(generated_when);
|
|
|
|
break;
|
|
|
|
|
2019-03-30 08:13:09 +01:00
|
|
|
case CONSTR_GENERATED:
|
|
|
|
appendStringInfoString(str, "GENERATED");
|
|
|
|
WRITE_NODE_FIELD(raw_expr);
|
|
|
|
WRITE_STRING_FIELD(cooked_expr);
|
|
|
|
WRITE_CHAR_FIELD(generated_when);
|
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_CHECK:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "CHECK");
|
2012-04-21 04:46:20 +02:00
|
|
|
WRITE_BOOL_FIELD(is_no_inherit);
|
2009-07-30 04:45:38 +02:00
|
|
|
WRITE_NODE_FIELD(raw_expr);
|
|
|
|
WRITE_STRING_FIELD(cooked_expr);
|
|
|
|
break;
|
|
|
|
|
1998-12-04 16:34:49 +01:00
|
|
|
case CONSTR_PRIMARY:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "PRIMARY_KEY");
|
2002-11-25 19:12:12 +01:00
|
|
|
WRITE_NODE_FIELD(keys);
|
2018-04-07 22:00:39 +02:00
|
|
|
WRITE_NODE_FIELD(including);
|
2006-07-02 04:23:23 +02:00
|
|
|
WRITE_NODE_FIELD(options);
|
2011-01-25 21:42:03 +01:00
|
|
|
WRITE_STRING_FIELD(indexname);
|
2004-08-02 06:28:29 +02:00
|
|
|
WRITE_STRING_FIELD(indexspace);
|
Fix tablespace inheritance for partitioned rels
Commit ca4103025dfe left a few loose ends. The most important one
(broken pg_dump output) is already fixed by virtue of commit
3b23552ad8bb, but some things remained:
* When ALTER TABLE rewrites tables, the indexes must remain in the
tablespace they were originally in. This didn't work because
index recreation during ALTER TABLE runs manufactured SQL (yuck),
which runs afoul of default_tablespace in competition with the parent
relation tablespace. To fix, reset default_tablespace to the empty
string temporarily, and add the TABLESPACE clause as appropriate.
* Setting a partitioned rel's tablespace to the database default is
confusing; if it worked, it would direct the partitions to that
tablespace regardless of default_tablespace. But in reality it does
not work, and making it work is a larger project. Therefore, throw
an error when this condition is detected, to alert the unwary.
Add some docs and tests, too.
Author: Álvaro Herrera
Discussion: https://postgr.es/m/CAKJS1f_1c260nOt_vBJ067AZ3JXptXVRohDVMLEBmudX1YEx-A@mail.gmail.com
2019-04-25 16:20:23 +02:00
|
|
|
WRITE_BOOL_FIELD(reset_default_tblspc);
|
2009-12-07 06:22:23 +01:00
|
|
|
/* access_method and where_clause not currently used */
|
2004-08-02 06:28:29 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
case CONSTR_UNIQUE:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "UNIQUE");
|
2022-02-03 11:29:54 +01:00
|
|
|
WRITE_BOOL_FIELD(nulls_not_distinct);
|
2004-08-02 06:28:29 +02:00
|
|
|
WRITE_NODE_FIELD(keys);
|
2018-04-07 22:00:39 +02:00
|
|
|
WRITE_NODE_FIELD(including);
|
2006-07-02 04:23:23 +02:00
|
|
|
WRITE_NODE_FIELD(options);
|
2011-01-25 21:42:03 +01:00
|
|
|
WRITE_STRING_FIELD(indexname);
|
2004-08-02 06:28:29 +02:00
|
|
|
WRITE_STRING_FIELD(indexspace);
|
2019-04-25 16:20:23 +02:00
|
|
|
WRITE_BOOL_FIELD(reset_default_tblspc);
|
2009-12-07 06:22:23 +01:00
|
|
|
/* access_method and where_clause not currently used */
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CONSTR_EXCLUSION:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "EXCLUSION");
|
2009-12-07 06:22:23 +01:00
|
|
|
WRITE_NODE_FIELD(exclusions);
|
2018-04-07 22:00:39 +02:00
|
|
|
WRITE_NODE_FIELD(including);
|
2009-12-07 06:22:23 +01:00
|
|
|
WRITE_NODE_FIELD(options);
|
2011-01-25 21:42:03 +01:00
|
|
|
WRITE_STRING_FIELD(indexname);
|
2009-12-07 06:22:23 +01:00
|
|
|
WRITE_STRING_FIELD(indexspace);
|
2019-04-25 16:20:23 +02:00
|
|
|
WRITE_BOOL_FIELD(reset_default_tblspc);
|
2009-12-07 06:22:23 +01:00
|
|
|
WRITE_STRING_FIELD(access_method);
|
|
|
|
WRITE_NODE_FIELD(where_clause);
|
1998-12-04 16:34:49 +01:00
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_FOREIGN:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "FOREIGN_KEY");
|
2009-07-30 04:45:38 +02:00
|
|
|
WRITE_NODE_FIELD(pktable);
|
|
|
|
WRITE_NODE_FIELD(fk_attrs);
|
|
|
|
WRITE_NODE_FIELD(pk_attrs);
|
|
|
|
WRITE_CHAR_FIELD(fk_matchtype);
|
|
|
|
WRITE_CHAR_FIELD(fk_upd_action);
|
|
|
|
WRITE_CHAR_FIELD(fk_del_action);
|
2021-12-08 11:09:44 +01:00
|
|
|
WRITE_NODE_FIELD(fk_del_set_cols);
|
ALTER TABLE: skip FK validation when it's safe to do so
We already skip rewriting the table in these cases, but we still force a
whole table scan to validate the data. This can be skipped, and thus
we can make the whole ALTER TABLE operation just do some catalog touches
instead of scanning the table, when these two conditions hold:
(a) Old and new pg_constraint.conpfeqop match exactly. This is actually
stronger than needed; we could loosen things by way of operator
families, but it'd require a lot more effort.
(b) The functions, if any, implementing a cast from the foreign type to
the primary opcintype are the same. For this purpose, we can consider a
binary coercion equivalent to an exact type match. When the opcintype
is polymorphic, require that the old and new foreign types match
exactly. (Since ri_triggers.c does use the executor, the stronger check
for polymorphic types is no mere future-proofing. However, no core type
exercises its necessity.)
Author: Noah Misch
Committer's note: catalog version bumped due to change of the Constraint
node. I can't actually find any way to have such a node in a stored
rule, but given that we have "out" support for them, better be safe.
2012-02-27 22:28:00 +01:00
|
|
|
WRITE_NODE_FIELD(old_conpfeqop);
|
Avoid repeated name lookups during table and index DDL.
If the name lookups come to different conclusions due to concurrent
activity, we might perform some parts of the DDL on a different table
than other parts. At least in the case of CREATE INDEX, this can be
used to cause the permissions checks to be performed against a
different table than the index creation, allowing for a privilege
escalation attack.
This changes the calling convention for DefineIndex, CreateTrigger,
transformIndexStmt, transformAlterTableStmt, CheckIndexCompatible
(in 9.2 and newer), and AlterTable (in 9.1 and older). In addition,
CheckRelationOwnership is removed in 9.2 and newer and the calling
convention is changed in older branches. A field has also been added
to the Constraint node (FkConstraint in 8.4). Third-party code calling
these functions or using the Constraint node will require updating.
Report by Andres Freund. Patch by Robert Haas and Andres Freund,
reviewed by Tom Lane.
Security: CVE-2014-0062
2014-02-17 15:33:31 +01:00
|
|
|
WRITE_OID_FIELD(old_pktable_oid);
|
2009-07-30 04:45:38 +02:00
|
|
|
WRITE_BOOL_FIELD(skip_validation);
|
2011-03-23 00:10:35 +01:00
|
|
|
WRITE_BOOL_FIELD(initially_valid);
|
1998-12-04 16:34:49 +01:00
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_ATTR_DEFERRABLE:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "ATTR_DEFERRABLE");
|
1998-12-04 16:34:49 +01:00
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_ATTR_NOT_DEFERRABLE:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "ATTR_NOT_DEFERRABLE");
|
1998-12-04 16:34:49 +01:00
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_ATTR_DEFERRED:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "ATTR_DEFERRED");
|
1998-12-04 16:34:49 +01:00
|
|
|
break;
|
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
case CONSTR_ATTR_IMMEDIATE:
|
2013-10-31 15:55:59 +01:00
|
|
|
appendStringInfoString(str, "ATTR_IMMEDIATE");
|
2009-07-30 04:45:38 +02:00
|
|
|
break;
|
2002-11-25 19:12:12 +01:00
|
|
|
|
2009-07-30 04:45:38 +02:00
|
|
|
default:
|
|
|
|
appendStringInfo(str, "<unrecognized_constraint %d>",
|
|
|
|
(int) node->contype);
|
|
|
|
break;
|
|
|
|
}
|
2001-10-25 16:08:11 +02:00
|
|
|
}
|
|
|
|
|
2016-06-18 21:22:34 +02:00
|
|
|
static void
|
|
|
|
_outForeignKeyCacheInfo(StringInfo str, const ForeignKeyCacheInfo *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("FOREIGNKEYCACHEINFO");
|
|
|
|
|
Correct attach/detach logic for FKs in partitions
There was no code to handle foreign key constraints on partitioned
tables in the case of ALTER TABLE DETACH; and if you happened to ATTACH
a partition that already had an equivalent constraint, that one was
ignored and a new constraint was created. Adding this to the fact that
foreign key cloning reuses the constraint name on the partition instead
of generating a new name (as it probably should, to cater to SQL
standard rules about constraint naming within schemas), the result was a
pretty poor user experience -- the most visible failure was that just
detaching a partition and re-attaching it failed with an error such as
ERROR: duplicate key value violates unique constraint "pg_constraint_conrelid_contypid_conname_index"
DETAIL: Key (conrelid, contypid, conname)=(26702, 0, test_result_asset_id_fkey) already exists.
because it would try to create an identically-named constraint in the
partition. To make matters worse, if you tried to drop the constraint
in the now-independent partition, that would fail because the constraint
was still seen as dependent on the constraint in its former parent
partitioned table:
ERROR: cannot drop inherited constraint "test_result_asset_id_fkey" of relation "test_result_cbsystem_0001_0050_monthly_2018_09"
This fix attacks the problem from two angles: first, when the partition
is detached, the constraint is also marked as independent, so the drop
now works. Second, when the partition is re-attached, we scan existing
constraints searching for one matching the FK in the parent, and if one
exists, we link that one to the parent constraint. So we don't end up
with a duplicate -- and better yet, we don't need to scan the referenced
table to verify that the constraint holds.
To implement this I made a small change to previously planner-only
struct ForeignKeyCacheInfo to contain the constraint OID; also relcache
now maintains the list of FKs for partitioned tables too.
Backpatch to 11.
Reported-by: Michael Vitale (bug #15425)
Discussion: https://postgr.es/m/15425-2dbc9d2aa999f816@postgresql.org
2018-10-12 17:36:26 +02:00
|
|
|
WRITE_OID_FIELD(conoid);
|
2016-06-18 21:22:34 +02:00
|
|
|
WRITE_OID_FIELD(conrelid);
|
|
|
|
WRITE_OID_FIELD(confrelid);
|
|
|
|
WRITE_INT_FIELD(nkeys);
|
2018-12-22 06:53:37 +01:00
|
|
|
WRITE_ATTRNUMBER_ARRAY(conkey, node->nkeys);
|
|
|
|
WRITE_ATTRNUMBER_ARRAY(confkey, node->nkeys);
|
|
|
|
WRITE_OID_ARRAY(conpfeqop, node->nkeys);
|
2016-06-18 21:22:34 +02:00
|
|
|
}
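The WRITE_*_ARRAY macros used just above are defined with the other output macros near the top of this file; as a hedged sketch of their general shape (not the verbatim definition), an Oid-array writer looks roughly like:

#define WRITE_OID_ARRAY_SKETCH(fldname, len) \
	do { \
		appendStringInfoString(str, " :" CppAsString(fldname) " "); \
		for (int i = 0; i < (len); i++) \
			appendStringInfo(str, " %u", node->fldname[i]); \
	} while (0)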
|
|
|
|
|
Implement table partitioning.
Table partitioning is like table inheritance and reuses much of the
existing infrastructure, but there are some important differences.
The parent is called a partitioned table and is always empty; it may
not have indexes or non-inherited constraints, since those make no
sense for a relation with no data of its own. The children are called
partitions and contain all of the actual data. Each partition has an
implicit partitioning constraint. Multiple inheritance is not
allowed, and partitioning and inheritance can't be mixed. Partitions
can't have extra columns and may not allow nulls unless the parent
does. Tuples inserted into the parent are automatically routed to the
correct partition, so tuple-routing ON INSERT triggers are not needed.
Tuple routing isn't yet supported for partitions which are foreign
tables, and it doesn't handle updates that cross partition boundaries.
Currently, tables can be range-partitioned or list-partitioned. List
partitioning is limited to a single column, but range partitioning can
involve multiple columns. A partitioning "column" can be an
expression.
Because table partitioning is less general than table inheritance, it
is hoped that it will be easier to reason about properties of
partitions, and therefore that this will serve as a better foundation
for a variety of possible optimizations, including query planner
optimizations. The tuple routing which this patch does based on
the implicit partitioning constraints is an example of this, but it
seems likely that many other useful optimizations are also possible.
Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat,
Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova,
Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00
|
|
|
static void
|
|
|
|
_outPartitionElem(StringInfo str, const PartitionElem *node)
|
|
|
|
{
|
|
|
|
WRITE_NODE_TYPE("PARTITIONELEM");
|
|
|
|
|
|
|
|
WRITE_STRING_FIELD(name);
|
|
|
|
WRITE_NODE_FIELD(expr);
|
|
|
|
WRITE_NODE_FIELD(collation);
|
|
|
|
WRITE_NODE_FIELD(opclass);
|
|
|
|
WRITE_LOCATION_FIELD(location);
|
|
|
|
}
|
|
|
|
|
Code review focused on new node types added by partitioning support.
Fix failure to check that we got a plain Const from const-simplification of
a coercion request. This is the cause of bug #14666 from Tian Bing: there
is an int4 to money cast, but it's only stable not immutable (because of
dependence on lc_monetary), resulting in a FuncExpr that the code was
miserably unequipped to deal with, or indeed even to notice that it was
failing to deal with. Add test cases around this coercion behavior.
In view of the above, sprinkle the code liberally with castNode() macros,
in hope of catching the next such bug a bit sooner. Also, change some
functions that were randomly declared to take Node* to take more specific
pointer types. And change some struct fields that were declared Node*
but could be given more specific types, allowing removal of assorted
explicit casts.
Place PARTITION_MAX_KEYS check a bit closer to the code it's protecting.
Likewise check only-one-key-for-list-partitioning restriction in a less
random place.
Avoid not-per-project-style usages like !strcmp(...).
Fix assorted failures to avoid scribbling on the input of parse
transformation. I'm not sure how necessary this is, but it's entirely
silly for these functions to be expending cycles to avoid that and not
getting it right.
Add guards against partitioning on system columns.
Put backend/nodes/ support code into an order that matches handling
of these node types elsewhere.
Annotate the fact that somebody added location fields to PartitionBoundSpec
and PartitionRangeDatum but forgot to handle them in
outfuncs.c/readfuncs.c. This is fairly harmless for production purposes
(since readfuncs.c would just substitute -1 anyway) but it's still bogus.
It's not worth forcing a post-beta1 initdb just to fix this, but if we
have another reason to force initdb before 10.0, we should go back and
clean this up.
Contrariwise, somebody added location fields to PartitionElem and
PartitionSpec but forgot to teach exprLocation() about them.
Consolidate duplicative code in transformPartitionBound().
Improve a couple of error messages.
Improve assorted commentary.
Re-pgindent the files touched by this patch; this affects a few comment
blocks that must have been added quite recently.
Report: https://postgr.es/m/20170524024550.29935.14396@wrigleys.postgresql.org
2017-05-29 05:20:28 +02:00
|
|
|
static void
|
|
|
|
_outPartitionSpec(StringInfo str, const PartitionSpec *node)
|
|
|
|
{
|
2017-05-30 17:32:41 +02:00
|
|
|
WRITE_NODE_TYPE("PARTITIONSPEC");
|
2017-05-29 05:20:28 +02:00

    WRITE_STRING_FIELD(strategy);
    WRITE_NODE_FIELD(partParams);
    WRITE_LOCATION_FIELD(location);
}
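
For reference, the WRITE_* helpers used by all of these out-functions are macros defined near the top of this file. The sketch below is a simplified rendering of that convention, not the exact definitions (string output, for instance, goes through a quoting helper that is elided here); it assumes the local variables str and node that every out-function declares:

/*
 * Simplified sketch of the field-output macro convention (assumption:
 * the real definitions also handle token quoting and special values).
 */
#define WRITE_NODE_TYPE(nodelabel) \
    appendStringInfoString(str, nodelabel)

#define WRITE_INT_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)

#define WRITE_BOOL_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %s", \
                     node->fldname ? "true" : "false")

#define WRITE_LOCATION_FIELD(fldname) \
    appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)

#define WRITE_NODE_FIELD(fldname) \
    (appendStringInfoString(str, " :" CppAsString(fldname) " "), \
     outNode(str, node->fldname))
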
Implement table partitioning.
Table partitioning is like table inheritance and reuses much of the
existing infrastructure, but there are some important differences.
The parent is called a partitioned table and is always empty; it may
not have indexes or non-inherited constraints, since those make no
sense for a relation with no data of its own. The children are called
partitions and contain all of the actual data. Each partition has an
implicit partitioning constraint. Multiple inheritance is not
allowed, and partitioning and inheritance can't be mixed. Partitions
can't have extra columns and may not allow nulls unless the parent
does. Tuples inserted into the parent are automatically routed to the
correct partition, so tuple-routing ON INSERT triggers are not needed.
Tuple routing isn't yet supported for partitions which are foreign
tables, and it doesn't handle updates that cross partition boundaries.
Currently, tables can be range-partitioned or list-partitioned. List
partitioning is limited to a single column, but range partitioning can
involve multiple columns. A partitioning "column" can be an
expression.
Because table partitioning is less general than table inheritance, it
is hoped that it will be easier to reason about properties of
partitions, and therefore that this will serve as a better foundation
for a variety of possible optimizations, including query planner
optimizations. The tuple routing which this patch does based on
the implicit partitioning constraints is an example of this, but it
seems likely that many other useful optimizations are also possible.
Amit Langote, reviewed and tested by Robert Haas, Ashutosh Bapat,
Amit Kapila, Rajkumar Raghuwanshi, Corey Huinker, Jaime Casanova,
Rushabh Lathia, Erik Rijkers, among others. Minor revisions by me.
2016-12-07 19:17:43 +01:00

static void
_outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node)
{
    WRITE_NODE_TYPE("PARTITIONBOUNDSPEC");

    WRITE_CHAR_FIELD(strategy);

Allow a partitioned table to have a default partition.
Any tuples that don't route to any other partition will route to the
default partition.
Jeevan Ladhe, Beena Emerson, Ashutosh Bapat, Rahila Syed, and Robert
Haas, with review and testing at various stages by (at least) Rushabh
Lathia, Keith Fiske, Amit Langote, Amul Sul, Rajkumar Raghuanshi, Sven
Kunze, Kyotaro Horiguchi, Thom Brown, Rafia Sabih, and Dilip Kumar.
Discussion: http://postgr.es/m/CAH2L28tbN4SYyhS7YV1YBWcitkqbhSWfQCy0G=apRcC_PEO-bg@mail.gmail.com
Discussion: http://postgr.es/m/CAOG9ApEYj34fWMcvBMBQ-YtqR9fTdXhdN82QEKG0SVZ6zeL1xg@mail.gmail.com
2017-09-08 23:28:04 +02:00

    WRITE_BOOL_FIELD(is_default);

Add hash partitioning.
Hash partitioning is useful when you want to partition a growing data
set evenly. This can be useful to keep table sizes reasonable, which
makes maintenance operations such as VACUUM faster, or to enable
partition-wise join.
At present, we still depend on constraint exclusion for partitioning
pruning, and the shape of the partition constraints for hash
partitioning is such that that doesn't work. Work is underway to fix
that, which should both improve performance and make partitioning
pruning work with hash partitioning.
Amul Sul, reviewed and tested by Dilip Kumar, Ashutosh Bapat, Yugo
Nagata, Rajkumar Raghuwanshi, Jesper Pedersen, and by me. A few
final tweaks also by me.
Discussion: http://postgr.es/m/CAAJ_b96fhpJAP=ALbETmeLk1Uni_GFZD938zgenhF49qgDTjaQ@mail.gmail.com
2017-11-10 00:07:25 +01:00

    WRITE_INT_FIELD(modulus);
    WRITE_INT_FIELD(remainder);
    WRITE_NODE_FIELD(listdatums);
    WRITE_NODE_FIELD(lowerdatums);
    WRITE_NODE_FIELD(upperdatums);
    WRITE_LOCATION_FIELD(location);
}
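
To make the output format concrete, here is a hand-constructed illustration (not captured from a live server, so the exact datum contents are indicative only) of the flat text _outPartitionBoundSpec emits for a simple range bound:

    {PARTITIONBOUNDSPEC :strategy r :is_default false :modulus 0
     :remainder 0 :listdatums <> :lowerdatums (...) :upperdatums (...)
     :location 42}

The usual way to obtain such text while debugging is the nodeToString() wrapper around outNode(); for example (boundspec being a hypothetical PartitionBoundSpec pointer):

/* Debugging sketch: dump a node tree to the server log */
elog(INFO, "bound: %s", nodeToString(boundspec));
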
static void
_outPartitionRangeDatum(StringInfo str, const PartitionRangeDatum *node)
{
    WRITE_NODE_TYPE("PARTITIONRANGEDATUM");

Use MINVALUE/MAXVALUE instead of UNBOUNDED for range partition bounds.
Previously, UNBOUNDED meant no lower bound when used in the FROM list,
and no upper bound when used in the TO list, which was OK for
single-column range partitioning, but problematic with multiple
columns. For example, an upper bound of (10.0, UNBOUNDED) would not be
collocated with a lower bound of (10.0, UNBOUNDED), thus making it
difficult or impossible to define contiguous multi-column range
partitions in some cases.
Fix this by using MINVALUE and MAXVALUE instead of UNBOUNDED to
represent a partition column that is unbounded below or above
respectively. This syntax removes any ambiguity, and ensures that if
one partition's lower bound equals another partition's upper bound,
then the partitions are contiguous.
Also drop the constraint prohibiting finite values after an unbounded
column, and just document the fact that any values after MINVALUE or
MAXVALUE are ignored. Previously it was necessary to repeat UNBOUNDED
multiple times, which was needlessly verbose.
Note: Forces a post-PG 10 beta2 initdb.
Report by Amul Sul, original patch by Amit Langote with some
additional hacking by me.
Discussion: https://postgr.es/m/CAAJ_b947mowpLdxL3jo3YLKngRjrq9+Ej4ymduQTfYR+8=YAYQ@mail.gmail.com
2017-07-21 10:20:47 +02:00

    WRITE_ENUM_FIELD(kind, PartitionRangeDatumKind);
    WRITE_NODE_FIELD(value);
    WRITE_LOCATION_FIELD(location);
}
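
The kind field written above records whether a range-bound entry is a finite value or one of the MINVALUE/MAXVALUE sentinels described in the commit message above. A sketch of the enum as one would expect it in parsenodes.h (the exact numeric values here are an assumption for illustration):

typedef enum PartitionRangeDatumKind
{
    PARTITION_RANGE_DATUM_MINVALUE = -1,    /* less than any other value */
    PARTITION_RANGE_DATUM_VALUE = 0,        /* a specific (bounded) value */
    PARTITION_RANGE_DATUM_MAXVALUE = 1      /* greater than any other value */
} PartitionRangeDatumKind;
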
/*
 * outNode -
 *    converts a Node into an ASCII string and appends it to 'str'
 */
void
outNode(StringInfo str, const void *obj)
{
    /* Guard against stack overflow due to overly complex expressions */
    check_stack_depth();

    if (obj == NULL)
        appendStringInfoString(str, "<>");
    else if (IsA(obj, List) || IsA(obj, IntList) || IsA(obj, OidList))
        _outList(str, obj);
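
Lists are serialized in parentheses rather than braces, with a leading 'i' or 'o' tag marking integer and OID lists so the reader can tell them apart. A simplified sketch of _outList, written to the conventions visible here (treat it as an illustration, not the exact code):

static void
_outList(StringInfo str, const List *node)
{
    const ListCell *lc;

    appendStringInfoChar(str, '(');

    /* tag non-Node lists so the reader knows how to parse the elements */
    if (IsA(node, IntList))
        appendStringInfoChar(str, 'i');
    else if (IsA(node, OidList))
        appendStringInfoChar(str, 'o');

    foreach(lc, node)
    {
        if (IsA(node, List))
        {
            outNode(str, lfirst(lc));
            if (lnext(node, lc))
                appendStringInfoChar(str, ' ');
        }
        else if (IsA(node, IntList))
            appendStringInfo(str, " %d", lfirst_int(lc));
        else if (IsA(node, OidList))
            appendStringInfo(str, " %u", lfirst_oid(lc));
    }

    appendStringInfoChar(str, ')');
}
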
Remove Value node struct
The Value node struct is a weird construct. It is its own node type,
but most of the time, it actually has a node type of Integer, Float,
String, or BitString. As a consequence, the struct name and the node
type don't match most of the time, and so it has to be treated
specially a lot. There doesn't seem to be any value in the special
construct. There is very little code that wants to accept all Value
variants but nothing else (and even if it did, this doesn't provide
any convenient way to check it), and most code wants either just one
particular node type (usually String), or it accepts a broader set of
node types besides just Value.
This change removes the Value struct and node type and replaces them
by separate Integer, Float, String, and BitString node types that are
proper node types and structs of their own and behave mostly like
normal node types.
Also, this removes the T_Null node tag, which was previously also a
possible variant of Value but wasn't actually used outside of the
Value contained in A_Const. Replace that by an isnull field in
A_Const.
Reviewed-by: Dagfinn Ilmari Mannsåker <ilmari@ilmari.org>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/5ba6bc5b-3f95-04f2-2419-f8ddb4c046fb@enterprisedb.com
2021-09-09 07:58:12 +02:00

    /* nodeRead does not want to see { } around these! */
    else if (IsA(obj, Integer))
        _outInteger(str, (Integer *) obj);
    else if (IsA(obj, Float))
        _outFloat(str, (Float *) obj);
    else if (IsA(obj, Boolean))
        _outBoolean(str, (Boolean *) obj);
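
The scalar out-functions dispatched to here are correspondingly small. A sketch of two of them, assuming the field names (ival, boolval) that the post-Value node structs use:

static void
_outInteger(StringInfo str, const Integer *node)
{
    appendStringInfo(str, "%d", node->ival);
}

static void
_outBoolean(StringInfo str, const Boolean *node)
{
    appendStringInfoString(str, node->boolval ? "true" : "false");
}
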
    else if (IsA(obj, String))
        _outString(str, (String *) obj);
    else if (IsA(obj, BitString))
        _outBitString(str, (BitString *) obj);
    else
    {
        appendStringInfoChar(str, '{');
        switch (nodeTag(obj))
        {
            case T_PlannedStmt:
                _outPlannedStmt(str, obj);
                break;
            case T_Result:
                _outResult(str, obj);
                break;

Move targetlist SRF handling from expression evaluation to new executor node.
Evaluation of set returning functions (SRFs) in the targetlist (like SELECT
generate_series(1,5)) so far was done in the expression evaluation (i.e.
ExecEvalExpr()) and projection (i.e. ExecProject/ExecTargetList) code.
This meant that most executor nodes performing projection, and most
expression evaluation functions, had to deal with the possibility that an
evaluated expression could return a set of return values.
That's bad because it leads to repeated code in a lot of places. It also,
and that's my (Andres's) motivation, made it a lot harder to implement a
more efficient way of doing expression evaluation.
To fix this, introduce a new executor node (ProjectSet) that can evaluate
targetlists containing one or more SRFs. To avoid the complexity of the old
way of handling nested expressions returning sets (e.g. having to pass up
ExprDoneCond, and dealing with arguments to functions returning sets etc.),
those SRFs can only be at the top level of the node's targetlist. The
planner makes sure (via split_pathtarget_at_srfs()) that SRF evaluation is
only necessary in ProjectSet nodes and that SRFs are only present at the
top level of the node's targetlist. If there are nested SRFs the planner
creates multiple stacked ProjectSet nodes. The ProjectSet nodes always get
input from an underlying node.
We also discussed and prototyped evaluating targetlist SRFs using ROWS
FROM(), but that turned out to be more complicated than we'd hoped.
While moving SRF evaluation to ProjectSet would allow to retain the old
"least common multiple" behavior when multiple SRFs are present in one
targetlist (i.e. continue returning rows until all SRFs are at the end of
their input at the same time), we decided to instead only return rows till
all SRFs are exhausted, returning NULL for already exhausted ones. We
deemed the previous behavior to be too confusing, unexpected and actually
not particularly useful.
As a side effect, the previously prohibited case of multiple set returning
arguments to a function, is now allowed. Not because it's particularly
desirable, but because it ends up working and there seems to be no argument
for adding code to prohibit it.
Currently the behavior for COALESCE and CASE containing SRFs has changed,
returning multiple rows from the expression, even when the SRF containing
"arm" of the expression is not evaluated. That's because the SRFs are
evaluated in a separate ProjectSet node. As that's quite confusing, we're
likely to instead prohibit SRFs in those places. But that's still being
discussed, and the code would reside in places not touched here, so that's
a task for later.
There's a lot of, now superfluous, code dealing with set return expressions
around. But as the changes to get rid of those are verbose and largely boring,
it seems better for readability to keep the cleanup as a separate commit.
Author: Tom Lane and Andres Freund
Discussion: https://postgr.es/m/20160822214023.aaxz5l4igypowyri@alap3.anarazel.de
2017-01-18 21:46:50 +01:00

            case T_ProjectSet:
                _outProjectSet(str, obj);
                break;
            case T_ModifyTable:
                _outModifyTable(str, obj);
                break;
            case T_Append:
                _outAppend(str, obj);
                break;
            case T_MergeAppend:
                _outMergeAppend(str, obj);
                break;
            case T_RecursiveUnion:
                _outRecursiveUnion(str, obj);
                break;
            case T_BitmapAnd:
                _outBitmapAnd(str, obj);
                break;
            case T_BitmapOr:
                _outBitmapOr(str, obj);
                break;

Add a Gather executor node.
A Gather executor node runs any number of copies of a plan in an equal
number of workers and merges all of the results into a single tuple
stream. It can also run the plan itself, if the workers are
unavailable or haven't started up yet. It is intended to work with
the Partial Seq Scan node which will be added in future commits.
It could also be used to implement parallel query of a different sort
by itself, without help from Partial Seq Scan, if the single_copy mode
is used. In that mode, a worker executes the plan, and the parallel
leader does not, merely collecting the worker's results. So, a Gather
node could be inserted into a plan to split the execution of that plan
across two processes. Nested Gather nodes aren't currently supported,
but we might want to add support for that in the future.
There's nothing in the planner to actually generate Gather nodes yet,
so it's not quite time to break out the champagne. But we're getting
close.
Amit Kapila. Some designs suggestions were provided by me, and I also
reviewed the patch. Single-copy mode, documentation, and other minor
changes also by me.
2015-10-01 01:23:36 +02:00

            case T_Gather:
                _outGather(str, obj);
                break;
            case T_GatherMerge:
                _outGatherMerge(str, obj);
                break;
            case T_Scan:
                _outScan(str, obj);
                break;
            case T_SeqScan:
                _outSeqScan(str, obj);
                break;

Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM. (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.) Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type. This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which is undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
Back-patch to 9.5 so that we don't have to support the original API
in production.
2015-07-25 20:39:00 +02:00

            case T_SampleScan:
                _outSampleScan(str, obj);
                break;
            case T_IndexScan:
                _outIndexScan(str, obj);
                break;
            case T_IndexOnlyScan:
                _outIndexOnlyScan(str, obj);
                break;
            case T_BitmapIndexScan:
                _outBitmapIndexScan(str, obj);
                break;
            case T_BitmapHeapScan:
                _outBitmapHeapScan(str, obj);
                break;
            case T_TidScan:
                _outTidScan(str, obj);
                break;
            case T_TidRangeScan:
                _outTidRangeScan(str, obj);
                break;
            case T_SubqueryScan:
                _outSubqueryScan(str, obj);
                break;
            case T_FunctionScan:
                _outFunctionScan(str, obj);
                break;
            case T_TableFuncScan:
                _outTableFuncScan(str, obj);
                break;
            case T_ValuesScan:
                _outValuesScan(str, obj);
                break;
            case T_CteScan:
                _outCteScan(str, obj);
                break;
            case T_NamedTuplestoreScan:
                _outNamedTuplestoreScan(str, obj);
                break;
            case T_WorkTableScan:
                _outWorkTableScan(str, obj);
                break;
            case T_ForeignScan:
                _outForeignScan(str, obj);
                break;
            case T_CustomScan:
                _outCustomScan(str, obj);
                break;
            case T_NestLoop:
                _outNestLoop(str, obj);
                break;
            case T_MergeJoin:
                _outMergeJoin(str, obj);
                break;
            case T_HashJoin:
                _outHashJoin(str, obj);
                break;
            case T_Agg:
                _outAgg(str, obj);
                break;
            case T_WindowAgg:
                _outWindowAgg(str, obj);
                break;
            case T_Group:
                _outGroup(str, obj);
                break;
            case T_Material:
                _outMaterial(str, obj);
                break;

Add Result Cache executor node (take 2)
Here we add a new executor node type named "Result Cache". The planner
can include this node type in the plan to have the executor cache the
results from the inner side of parameterized nested loop joins. This
allows caching of tuples for sets of parameters so that in the event that
the node sees the same parameter values again, it can just return the
cached tuples instead of rescanning the inner side of the join all over
again. Internally, result cache uses a hash table in order to quickly
find tuples that have been previously cached.
For certain data sets, this can significantly improve the performance of
joins. The best cases for using this new node type are for join problems
where a large portion of the tuples from the inner side of the join have
no join partner on the outer side of the join. In such cases, hash join
would have to hash values that are never looked up, thus bloating the hash
table and possibly causing it to multi-batch. Merge joins would have to
skip over all of the unmatched rows. If we use a nested loop join with a
result cache, then we only cache tuples that have at least one join
partner on the outer side of the join. The benefits of using a
parameterized nested loop with a result cache increase when there are
fewer distinct values being looked up and the number of lookups of each
value is large. Also, hash probes to lookup the cache can be much faster
than the hash probe in a hash join as it's common that the result cache's
hash table is much smaller than the hash join's due to result cache only
caching useful tuples rather than all tuples from the inner side of the
join. This variation in hash probe performance is more significant when
the hash join's hash table no longer fits into the CPU's L3 cache, but the
result cache's hash table does. The apparent "random" access of hash
buckets with each hash probe can cause a poor L3 cache hit ratio for large
hash tables. Smaller hash tables generally perform better.
The hash table used for the cache limits itself to not exceeding work_mem
* hash_mem_multiplier in size. We maintain a dlist of keys for this cache
and when we're adding new tuples and realize we've exceeded the memory
budget, we evict cache entries starting with the least recently used ones
until we have enough memory to add the new tuples to the cache.
For parameterized nested loop joins, we now consider using one of these
result cache nodes in between the nested loop node and its inner node. We
determine when this might be useful based on cost, which is primarily
driven off of what the expected cache hit ratio will be. Estimating the
cache hit ratio relies on having good distinct estimates on the nested
loop's parameters.
For now, the planner will only consider using a result cache for
parameterized nested loop joins. This works for both normal joins and
also for LATERAL type joins to subqueries. It is possible to use this new
node for other uses in the future. For example, to cache results from
correlated subqueries. However, that's not done here due to some
difficulties obtaining a distinct estimation on the outer plan to
calculate the estimated cache hit ratio. Currently we plan the inner plan
before planning the outer plan so there is no good way to know if a result
cache would be useful or not since we can't estimate the number of times
the subplan will be called until the outer plan is generated.
The functionality being added here is newly introducing a dependency on
the return value of estimate_num_groups() during the join search.
Previously, during the join search, we only ever needed to perform
selectivity estimations. With this commit, we need to use
estimate_num_groups() in order to estimate what the hit ratio on the
result cache will be. In simple terms, if we expect 10 distinct values
and we expect 1000 outer rows, then we'll estimate the hit ratio to be
99%. Since cache hits are very cheap compared to scanning the underlying
nodes on the inner side of the nested loop join, then this will
significantly reduce the planner's cost for the join. However, it's
fairly easy to see here that things will go bad when estimate_num_groups()
incorrectly returns a value that's significantly lower than the actual
number of distinct values. If this happens then that may cause us to make
use of a nested loop join with a result cache instead of some other join
type, such as a merge or hash join. Our distinct estimations have been
known to be a source of trouble in the past, so the extra reliance on them
here could cause the planner to choose slower plans than it did previous
to having this feature. Distinct estimations are also fairly hard to
estimate accurately when several tables have been joined already or when a
WHERE clause filters out a set of values that are correlated to the
expressions we're estimating the number of distinct value for.
For now, the costing we perform during query planning for result caches
does put quite a bit of faith in the distinct estimations being accurate.
When these are accurate then we should generally see faster execution
times for plans containing a result cache. However, in the real world, we
may find that we need to either change the costings to put less trust in
the distinct estimations being accurate or perhaps even disable this
feature by default. There's always an element of risk when we teach the
query planner to do new tricks that it decides to use that new trick at
the wrong time and causes a regression. Users may opt to get the old
behavior by turning the feature off using the enable_resultcache GUC.
Currently, this is enabled by default. It remains to be seen if we'll
maintain that setting for the release.
Additionally, the name "Result Cache" is the best name I could think of
for this new node at the time I started writing the patch. Nobody seems
to strongly dislike the name. A few people did suggest other names but no
other name seemed to dominate in the brief discussion that there was about
names. Let's allow the beta period to see if the current name pleases
enough people. If there's some consensus on a better name, then we can
change it before the release. Please see the 2nd discussion link below
for the discussion on the "Result Cache" name.
Author: David Rowley
Reviewed-by: Andy Fan, Justin Pryzby, Zhihong Yu, Hou Zhijie
Tested-By: Konstantin Knizhnik
Discussion: https://postgr.es/m/CAApHDvrPcQyQdWERGYWx8J%2B2DLUNgXu%2BfOSbQ1UscxrunyXyrQ%40mail.gmail.com
Discussion: https://postgr.es/m/CAApHDvq=yQXr5kqhRviT2RhNKwToaWr9JAN5t+5_PzhuRJ3wvg@mail.gmail.com
2021-04-02 03:10:56 +02:00

            case T_Memoize:
                _outMemoize(str, obj);
                break;
            case T_Sort:
                _outSort(str, obj);
                break;

Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example when the relation is already sorted by (key1, key2) and we need
to sort it by (key1, key2, key3) we can simply split the input rows into
groups having equal values in (key1, key2), and only sort/compare the
remaining column key3.
This has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produce results after each
prefix group, which is beneficial for plans where startup cost matters
(like for example queries with LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without check of equality on the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There were many reviewers/contributors since 2013 - I've done my best to
pick the most active ones, and listed them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
2020-04-06 21:33:28 +02:00

            case T_IncrementalSort:
                _outIncrementalSort(str, obj);
                break;
            case T_Unique:
                _outUnique(str, obj);
                break;
            case T_Hash:
                _outHash(str, obj);
                break;
            case T_SetOp:
                _outSetOp(str, obj);
                break;
            case T_LockRows:
                _outLockRows(str, obj);
                break;
            case T_Limit:
                _outLimit(str, obj);
                break;
            case T_NestLoopParam:
                _outNestLoopParam(str, obj);
                break;

Re-implement EvalPlanQual processing to improve its performance and eliminate
a lot of strange behaviors that occurred in join cases. We now identify the
"current" row for every joined relation in UPDATE, DELETE, and SELECT FOR
UPDATE/SHARE queries. If an EvalPlanQual recheck is necessary, we jam the
appropriate row into each scan node in the rechecking plan, forcing it to emit
only that one row. The former behavior could rescan the whole of each joined
relation for each recheck, which was terrible for performance, and what's much
worse could result in duplicated output tuples.
Also, the original implementation of EvalPlanQual could not re-use the recheck
execution tree --- it had to go through a full executor init and shutdown for
every row to be tested. To avoid this overhead, I've associated a special
runtime Param with each LockRows or ModifyTable plan node, and arranged to
make every scan node below such a node depend on that Param. Thus, by
signaling a change in that Param, the EPQ machinery can just rescan the
already-built test plan.
This patch also adds a prohibition on set-returning functions in the
targetlist of SELECT FOR UPDATE/SHARE. This is needed to avoid the
duplicate-output-tuple problem. It seems fairly reasonable since the
other restrictions on SELECT FOR UPDATE are meant to ensure that there
is a unique correspondence between source tuples and result tuples,
which an output SRF destroys as much as anything else does.
2009-10-26 03:26:45 +01:00

            case T_PlanRowMark:
                _outPlanRowMark(str, obj);
                break;
            case T_PartitionPruneInfo:
                _outPartitionPruneInfo(str, obj);
                break;
            case T_PartitionedRelPruneInfo:
                _outPartitionedRelPruneInfo(str, obj);
                break;
            case T_PartitionPruneStepOp:
                _outPartitionPruneStepOp(str, obj);
                break;
            case T_PartitionPruneStepCombine:
                _outPartitionPruneStepCombine(str, obj);
                break;
            case T_PlanInvalItem:
                _outPlanInvalItem(str, obj);
                break;
            case T_Alias:
                _outAlias(str, obj);
                break;
            case T_RangeVar:
                _outRangeVar(str, obj);
                break;
            case T_TableFunc:
                _outTableFunc(str, obj);
                break;
            case T_IntoClause:
                _outIntoClause(str, obj);
                break;
            case T_Var:
                _outVar(str, obj);
                break;
            case T_Const:
                _outConst(str, obj);
                break;
            case T_Param:
                _outParam(str, obj);
                break;
            case T_Aggref:
                _outAggref(str, obj);
                break;

Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
2015-05-16 03:40:59 +02:00

            case T_GroupingFunc:
                _outGroupingFunc(str, obj);
                break;
            case T_WindowFunc:
                _outWindowFunc(str, obj);
                break;
            case T_SubscriptingRef:
                _outSubscriptingRef(str, obj);
                break;
            case T_FuncExpr:
                _outFuncExpr(str, obj);
                break;
            case T_NamedArgExpr:
                _outNamedArgExpr(str, obj);
                break;
            case T_OpExpr:
                _outOpExpr(str, obj);
                break;
            case T_DistinctExpr:
                _outDistinctExpr(str, obj);
                break;
            case T_NullIfExpr:
                _outNullIfExpr(str, obj);
                break;
            case T_ScalarArrayOpExpr:
                _outScalarArrayOpExpr(str, obj);
                break;
            case T_BoolExpr:
                _outBoolExpr(str, obj);
                break;
            case T_SubLink:
                _outSubLink(str, obj);
                break;
            case T_SubPlan:
                _outSubPlan(str, obj);
                break;
            case T_AlternativeSubPlan:
                _outAlternativeSubPlan(str, obj);
                break;
            case T_FieldSelect:
                _outFieldSelect(str, obj);
                break;
            case T_FieldStore:
                _outFieldStore(str, obj);
                break;
            case T_RelabelType:
                _outRelabelType(str, obj);
                break;
            case T_CoerceViaIO:
                _outCoerceViaIO(str, obj);
                break;
            case T_ArrayCoerceExpr:
                _outArrayCoerceExpr(str, obj);
                break;
            case T_ConvertRowtypeExpr:
                _outConvertRowtypeExpr(str, obj);
                break;
            case T_CollateExpr:
                _outCollateExpr(str, obj);
                break;
            case T_CaseExpr:
                _outCaseExpr(str, obj);
                break;
            case T_CaseWhen:
                _outCaseWhen(str, obj);
                break;
            case T_CaseTestExpr:
                _outCaseTestExpr(str, obj);
                break;
            case T_ArrayExpr:
                _outArrayExpr(str, obj);
                break;
            case T_RowExpr:
                _outRowExpr(str, obj);
                break;
            case T_RowCompareExpr:
                _outRowCompareExpr(str, obj);
                break;
            case T_CoalesceExpr:
                _outCoalesceExpr(str, obj);
                break;
            case T_MinMaxExpr:
                _outMinMaxExpr(str, obj);
                break;
            case T_SQLValueFunction:
                _outSQLValueFunction(str, obj);
                break;
            case T_XmlExpr:
                _outXmlExpr(str, obj);
                break;
            case T_NullTest:
                _outNullTest(str, obj);
                break;
            case T_BooleanTest:
                _outBooleanTest(str, obj);
                break;
            case T_CoerceToDomain:
                _outCoerceToDomain(str, obj);
                break;
            case T_CoerceToDomainValue:
                _outCoerceToDomainValue(str, obj);
                break;
            case T_SetToDefault:
                _outSetToDefault(str, obj);
                break;
            case T_CurrentOfExpr:
                _outCurrentOfExpr(str, obj);
                break;

Code review for NextValueExpr expression node type.
Add missing infrastructure for this node type, notably in ruleutils.c where
its lack could demonstrably cause EXPLAIN to fail. Add outfuncs/readfuncs
support. (outfuncs support is useful today for debugging purposes. The
readfuncs support may never be needed, since at present it would only
matter for parallel query and NextValueExpr should never appear in a
parallelizable query; but it seems like a bad idea to have a primnode type
that isn't fully supported here.) Teach planner infrastructure that
NextValueExpr is a volatile, parallel-unsafe, non-leaky expression node
with cost cpu_operator_cost. Given its limited scope of usage, there
*might* be no live bug today from the lack of that knowledge, but it's
certainly going to bite us on the rear someday. Teach pg_stat_statements
about the new node type, too.
While at it, also teach cost_qual_eval() that MinMaxExpr, SQLValueFunction,
XmlExpr, and CoerceToDomain should be charged as cpu_operator_cost.
Failing to do this for SQLValueFunction was an oversight in my commit
0bb51aa96. The others are longer-standing oversights, but no time like the
present to fix them. (In principle, CoerceToDomain could have cost much
higher than this, but it doesn't presently seem worth trying to examine the
domain's constraints here.)
Modify execExprInterp.c to execute NextValueExpr as an out-of-line
function; it seems quite unlikely to me that it's worth insisting that
it be inlined in all expression eval methods. Besides, providing the
out-of-line function doesn't stop anyone from inlining if they want to.
Adjust some places where NextValueExpr support had been inserted with the
aid of a dartboard rather than keeping it in the same order as elsewhere.
Discussion: https://postgr.es/m/23862.1499981661@sss.pgh.pa.us
2017-07-14 21:25:43 +02:00

            case T_NextValueExpr:
                _outNextValueExpr(str, obj);
                break;

Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE.
The newly added ON CONFLICT clause allows to specify an alternative to
raising a unique or exclusion constraint violation error when inserting.
ON CONFLICT refers to constraints that can either be specified using a
inference clause (by specifying the columns of a unique constraint) or
by naming a unique or exclusion constraint. DO NOTHING avoids the
constraint violation, without touching the pre-existing row. DO UPDATE
SET ... [WHERE ...] updates the pre-existing tuple, and has access to
both the tuple proposed for insertion and the existing tuple; the
optional WHERE clause can be used to prevent an update from being
executed. The UPDATE SET and WHERE clauses have access to the tuple
proposed for insertion using the "magic" EXCLUDED alias, and to the
pre-existing tuple using the table name or its alias.
This feature is often referred to as upsert.
This is implemented using a new infrastructure called "speculative
insertion". It is an optimistic variant of regular insertion that first
does a pre-check for existing tuples and then attempts an insert. If a
violating tuple was inserted concurrently, the speculatively inserted
tuple is deleted and a new attempt is made. If the pre-check finds a
matching tuple the alternative DO NOTHING or DO UPDATE action is taken.
If the insertion succeeds without detecting a conflict, the tuple is
deemed inserted.
To handle the possible ambiguity between the excluded alias and a table
named excluded, and for convenience with long relation names, INSERT
INTO now can alias its target table.
Bumps catversion as stored rules change.
Author: Peter Geoghegan, with significant contributions from Heikki
Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes.
Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs,
Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00

            case T_InferenceElem:
                _outInferenceElem(str, obj);
                break;
            case T_TargetEntry:
                _outTargetEntry(str, obj);
                break;
            case T_RangeTblRef:
                _outRangeTblRef(str, obj);
                break;
            case T_JoinExpr:
                _outJoinExpr(str, obj);
                break;
            case T_FromExpr:
                _outFromExpr(str, obj);
                break;
            case T_OnConflictExpr:
                _outOnConflictExpr(str, obj);
                break;
            case T_Path:
                _outPath(str, obj);
                break;
            case T_IndexPath:
                _outIndexPath(str, obj);
                break;
            case T_BitmapHeapPath:
                _outBitmapHeapPath(str, obj);
                break;
            case T_BitmapAndPath:
                _outBitmapAndPath(str, obj);
                break;
            case T_BitmapOrPath:
                _outBitmapOrPath(str, obj);
                break;
            case T_TidPath:
                _outTidPath(str, obj);
                break;
            case T_TidRangePath:
                _outTidRangePath(str, obj);
                break;

        case T_SubqueryScanPath:
            _outSubqueryScanPath(str, obj);
            break;
        case T_ForeignPath:
            _outForeignPath(str, obj);
            break;
        case T_CustomPath:
            _outCustomPath(str, obj);
            break;
        case T_AppendPath:
            _outAppendPath(str, obj);
            break;
        case T_MergeAppendPath:
            _outMergeAppendPath(str, obj);
            break;
        case T_GroupResultPath:
            _outGroupResultPath(str, obj);
            break;
        case T_MaterialPath:
            _outMaterialPath(str, obj);
            break;
        case T_MemoizePath:
            _outMemoizePath(str, obj);
            break;
        case T_UniquePath:
            _outUniquePath(str, obj);
            break;
        case T_GatherPath:
            _outGatherPath(str, obj);
            break;
        case T_ProjectionPath:
            _outProjectionPath(str, obj);
            break;
        case T_ProjectSetPath:
            _outProjectSetPath(str, obj);
            break;
        case T_SortPath:
            _outSortPath(str, obj);
            break;
        case T_IncrementalSortPath:
            _outIncrementalSortPath(str, obj);
            break;
        case T_GroupPath:
            _outGroupPath(str, obj);
            break;
        case T_UpperUniquePath:
            _outUpperUniquePath(str, obj);
            break;
        case T_AggPath:
            _outAggPath(str, obj);
            break;
        case T_GroupingSetsPath:
            _outGroupingSetsPath(str, obj);
            break;
        case T_MinMaxAggPath:
            _outMinMaxAggPath(str, obj);
            break;
        case T_WindowAggPath:
            _outWindowAggPath(str, obj);
            break;
        case T_SetOpPath:
            _outSetOpPath(str, obj);
            break;
        case T_RecursiveUnionPath:
            _outRecursiveUnionPath(str, obj);
            break;
        case T_LockRowsPath:
            _outLockRowsPath(str, obj);
            break;
        case T_ModifyTablePath:
            _outModifyTablePath(str, obj);
            break;
        case T_LimitPath:
            _outLimitPath(str, obj);
            break;
        case T_GatherMergePath:
            _outGatherMergePath(str, obj);
            break;
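
        /* join Path nodes */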
        case T_NestPath:
            _outNestPath(str, obj);
            break;
        case T_MergePath:
            _outMergePath(str, obj);
            break;
        case T_HashPath:
            _outHashPath(str, obj);
            break;
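
        /* planner support data structures */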
        case T_PlannerGlobal:
            _outPlannerGlobal(str, obj);
            break;
        case T_PlannerInfo:
            _outPlannerInfo(str, obj);
            break;
        case T_RelOptInfo:
            _outRelOptInfo(str, obj);
            break;
        case T_IndexOptInfo:
            _outIndexOptInfo(str, obj);
            break;
        case T_ForeignKeyOptInfo:
            _outForeignKeyOptInfo(str, obj);
            break;
        case T_EquivalenceClass:
            _outEquivalenceClass(str, obj);
            break;
        case T_EquivalenceMember:
            _outEquivalenceMember(str, obj);
            break;
        case T_PathKey:
            _outPathKey(str, obj);
            break;
        case T_PathTarget:
            _outPathTarget(str, obj);
            break;
        case T_ParamPathInfo:
            _outParamPathInfo(str, obj);
            break;
        case T_RestrictInfo:
            _outRestrictInfo(str, obj);
            break;
        case T_IndexClause:
            _outIndexClause(str, obj);
            break;
        case T_PlaceHolderVar:
            _outPlaceHolderVar(str, obj);
            break;
        case T_SpecialJoinInfo:
            _outSpecialJoinInfo(str, obj);
            break;
        case T_AppendRelInfo:
            _outAppendRelInfo(str, obj);
            break;
        case T_RowIdentityVarInfo:
            _outRowIdentityVarInfo(str, obj);
            break;
        case T_PlaceHolderInfo:
            _outPlaceHolderInfo(str, obj);
            break;
        case T_MinMaxAggInfo:
            _outMinMaxAggInfo(str, obj);
            break;
        case T_PlannerParamItem:
            _outPlannerParamItem(str, obj);
            break;
        case T_RollupData:
            _outRollupData(str, obj);
            break;
        case T_GroupingSetData:
            _outGroupingSetData(str, obj);
            break;
        case T_StatisticExtInfo:
            _outStatisticExtInfo(str, obj);
            break;
        case T_ExtensibleNode:
            _outExtensibleNode(str, obj);
            break;
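
        /* utility statement and related parse nodes */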
        case T_CreateStmt:
            _outCreateStmt(str, obj);
            break;
        case T_CreateForeignTableStmt:
            _outCreateForeignTableStmt(str, obj);
            break;
        case T_ImportForeignSchemaStmt:
            _outImportForeignSchemaStmt(str, obj);
            break;
        case T_IndexStmt:
            _outIndexStmt(str, obj);
            break;
        case T_CreateStatsStmt:
            _outCreateStatsStmt(str, obj);
            break;
        case T_AlterStatsStmt:
            _outAlterStatsStmt(str, obj);
            break;
        case T_NotifyStmt:
            _outNotifyStmt(str, obj);
            break;
        case T_DeclareCursorStmt:
            _outDeclareCursorStmt(str, obj);
            break;
        case T_SelectStmt:
            _outSelectStmt(str, obj);
            break;
        case T_ReturnStmt:
            _outReturnStmt(str, obj);
            break;
        case T_PLAssignStmt:
            _outPLAssignStmt(str, obj);
            break;
        case T_ColumnDef:
            _outColumnDef(str, obj);
            break;
        case T_TypeName:
            _outTypeName(str, obj);
            break;
        case T_TypeCast:
            _outTypeCast(str, obj);
            break;
        case T_CollateClause:
            _outCollateClause(str, obj);
            break;
        case T_IndexElem:
            _outIndexElem(str, obj);
            break;
        case T_StatsElem:
            _outStatsElem(str, obj);
            break;
        case T_Query:
            _outQuery(str, obj);
            break;
        case T_WithCheckOption:
            _outWithCheckOption(str, obj);
            break;
        case T_SortGroupClause:
            _outSortGroupClause(str, obj);
            break;
        case T_GroupingSet:
            _outGroupingSet(str, obj);
            break;
        case T_WindowClause:
            _outWindowClause(str, obj);
            break;
        case T_RowMarkClause:
            _outRowMarkClause(str, obj);
            break;
        case T_WithClause:
            _outWithClause(str, obj);
            break;
        case T_CTESearchClause:
            _outCTESearchClause(str, obj);
            break;
        case T_CTECycleClause:
            _outCTECycleClause(str, obj);
            break;
        case T_CommonTableExpr:
            _outCommonTableExpr(str, obj);
            break;
        case T_MergeWhenClause:
            _outMergeWhenClause(str, obj);
            break;
        case T_MergeAction:
            _outMergeAction(str, obj);
            break;
        case T_SetOperationStmt:
            _outSetOperationStmt(str, obj);
            break;
        case T_RangeTblEntry:
            _outRangeTblEntry(str, obj);
            break;
        case T_RangeTblFunction:
            _outRangeTblFunction(str, obj);
            break;
        case T_TableSampleClause:
            _outTableSampleClause(str, obj);
            break;
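
        /* raw parse-tree expression and clause nodes */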
        case T_A_Expr:
            _outA_Expr(str, obj);
            break;
        case T_ColumnRef:
            _outColumnRef(str, obj);
            break;
        case T_ParamRef:
            _outParamRef(str, obj);
            break;
        case T_RawStmt:
            _outRawStmt(str, obj);
            break;
        case T_A_Const:
            _outA_Const(str, obj);
            break;
        case T_A_Star:
            _outA_Star(str, obj);
            break;
        case T_A_Indices:
            _outA_Indices(str, obj);
            break;
        case T_A_Indirection:
            _outA_Indirection(str, obj);
            break;
        case T_A_ArrayExpr:
            _outA_ArrayExpr(str, obj);
            break;
        case T_ResTarget:
            _outResTarget(str, obj);
            break;
        case T_MultiAssignRef:
            _outMultiAssignRef(str, obj);
            break;
        case T_SortBy:
            _outSortBy(str, obj);
            break;
        case T_WindowDef:
            _outWindowDef(str, obj);
            break;
        case T_RangeSubselect:
            _outRangeSubselect(str, obj);
            break;
        case T_RangeFunction:
            _outRangeFunction(str, obj);
            break;
        case T_RangeTableSample:
            _outRangeTableSample(str, obj);
            break;
        case T_RangeTableFunc:
            _outRangeTableFunc(str, obj);
            break;
        case T_RangeTableFuncCol:
            _outRangeTableFuncCol(str, obj);
            break;
        case T_Constraint:
            _outConstraint(str, obj);
            break;
        case T_FuncCall:
            _outFuncCall(str, obj);
            break;
        case T_DefElem:
            _outDefElem(str, obj);
            break;
        case T_TableLikeClause:
            _outTableLikeClause(str, obj);
            break;
        case T_LockingClause:
            _outLockingClause(str, obj);
            break;
        case T_XmlSerialize:
            _outXmlSerialize(str, obj);
            break;
        case T_ForeignKeyCacheInfo:
            _outForeignKeyCacheInfo(str, obj);
            break;
        case T_TriggerTransition:
            _outTriggerTransition(str, obj);
            break;
        case T_PartitionElem:
            _outPartitionElem(str, obj);
            break;
|
Code review focused on new node types added by partitioning support.
Fix failure to check that we got a plain Const from const-simplification of
a coercion request. This is the cause of bug #14666 from Tian Bing: there
is an int4 to money cast, but it's only stable not immutable (because of
dependence on lc_monetary), resulting in a FuncExpr that the code was
miserably unequipped to deal with, or indeed even to notice that it was
failing to deal with. Add test cases around this coercion behavior.
In view of the above, sprinkle the code liberally with castNode() macros,
in hope of catching the next such bug a bit sooner. Also, change some
functions that were randomly declared to take Node* to take more specific
pointer types. And change some struct fields that were declared Node*
but could be given more specific types, allowing removal of assorted
explicit casts.
Place PARTITION_MAX_KEYS check a bit closer to the code it's protecting.
Likewise check only-one-key-for-list-partitioning restriction in a less
random place.
Avoid not-per-project-style usages like !strcmp(...).
Fix assorted failures to avoid scribbling on the input of parse
transformation. I'm not sure how necessary this is, but it's entirely
silly for these functions to be expending cycles to avoid that and not
getting it right.
Add guards against partitioning on system columns.
Put backend/nodes/ support code into an order that matches handling
of these node types elsewhere.
Annotate the fact that somebody added location fields to PartitionBoundSpec
and PartitionRangeDatum but forgot to handle them in
outfuncs.c/readfuncs.c. This is fairly harmless for production purposes
(since readfuncs.c would just substitute -1 anyway) but it's still bogus.
It's not worth forcing a post-beta1 initdb just to fix this, but if we
have another reason to force initdb before 10.0, we should go back and
clean this up.
Contrariwise, somebody added location fields to PartitionElem and
PartitionSpec but forgot to teach exprLocation() about them.
Consolidate duplicative code in transformPartitionBound().
Improve a couple of error messages.
Improve assorted commentary.
Re-pgindent the files touched by this patch; this affects a few comment
blocks that must have been added quite recently.
Report: https://postgr.es/m/20170524024550.29935.14396@wrigleys.postgresql.org
2017-05-29 05:20:28 +02:00
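/*
 * Illustrative sketch of the castNode() idiom mentioned above ("expr" is
 * a made-up variable name):
 *
 *		FuncExpr   *fexpr = castNode(FuncExpr, expr);
 *
 * In assert-enabled builds this checks nodeTag(expr) == T_FuncExpr before
 * casting, so a mismatched node type fails immediately instead of being
 * silently misinterpreted.
 */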
|
|
|
case T_PartitionSpec:
|
|
|
|
_outPartitionSpec(str, obj);
|
|
|
|
break;
|
2016-12-07 19:17:43 +01:00
|
|
|
case T_PartitionBoundSpec:
|
|
|
|
_outPartitionBoundSpec(str, obj);
|
|
|
|
break;
|
|
|
|
case T_PartitionRangeDatum:
|
|
|
|
_outPartitionRangeDatum(str, obj);
|
|
|
|
break;
|
Common SQL/JSON clauses
This introduces some of the building blocks used by the SQL/JSON
constructor and query functions. Specifically, it provides node
executor and grammar support for the FORMAT JSON [ENCODING foo]
clause, and values decorated with it, and for the RETURNING clause.
The following SQL/JSON patches will leverage these.
Nikita Glukhov (who probably deserves an award for perseverance).
Reviewers have included (in no particular order) Andres Freund, Alexander
Korotkov, Pavel Stehule, Andrew Alsup, Erik Rijkers, Zihong Yu,
Himanshu Upadhyaya, Daniel Gustafsson, Justin Pryzby.
Discussion: https://postgr.es/m/cd0bb935-0158-78a7-08b5-904886deac4b@postgrespro.ru
2022-03-03 19:00:49 +01:00
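/*
 * Illustrative sketch of the FORMAT JSON and RETURNING clauses described
 * above; JSON_OBJECT itself arrives with the constructor patch that
 * follows:
 *
 *		SELECT JSON_OBJECT('a' VALUE '[1, 2]' FORMAT JSON
 *						   RETURNING jsonb);
 */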
|
|
|
case T_JsonFormat:
|
|
|
|
_outJsonFormat(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonReturning:
|
|
|
|
_outJsonReturning(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonValueExpr:
|
|
|
|
_outJsonValueExpr(str, obj);
|
|
|
|
break;
|
SQL/JSON constructors
This patch introduces the SQL/JSON standard constructors for JSON:
JSON()
JSON_ARRAY()
JSON_ARRAYAGG()
JSON_OBJECT()
JSON_OBJECTAGG()
For the most part these functions provide facilities that mimic
existing json/jsonb functions. However, they also offer some useful
additional functionality. In addition to text input, the JSON() function
accepts bytea input, which it will decode and construct a json value from.
The other functions provide useful options for handling duplicate keys
and null values.
This series of patches will be followed by a consolidated documentation
patch.
Nikita Glukhov
Reviewers have included (in no particular order) Andres Freund, Alexander
Korotkov, Pavel Stehule, Andrew Alsup, Erik Rijkers, Zihong Yu,
Himanshu Upadhyaya, Daniel Gustafsson, Justin Pryzby.
Discussion: https://postgr.es/m/cd0bb935-0158-78a7-08b5-904886deac4b@postgrespro.ru
2022-03-03 19:02:10 +01:00
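/*
 * Illustrative examples of the constructors listed above (a sketch, not
 * exhaustive):
 *
 *		SELECT JSON_OBJECT('a' VALUE 1, 'b' VALUE NULL ABSENT ON NULL);
 *		SELECT JSON_ARRAY(1, 2, 3 RETURNING jsonb);
 */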
|
|
|
case T_JsonConstructorExpr:
|
|
|
|
_outJsonConstructorExpr(str, obj);
|
|
|
|
break;
|
IS JSON predicate
This patch introduces the SQL standard IS JSON predicate. It operates
on text and bytea values representing JSON as well as on the json and
jsonb types. Each test has an IS and IS NOT variant. The tests are:
IS JSON [VALUE]
IS JSON ARRAY
IS JSON OBJECT
IS JSON SCALAR
IS JSON WITH | WITHOUT UNIQUE KEYS
These are mostly self-explanatory, but note that IS JSON WITHOUT UNIQUE
KEYS is true whenever IS JSON is true, and IS JSON WITH UNIQUE KEYS is
true whenever IS JSON is true except if IS JSON OBJECT is true and there
are duplicate keys (which is never the case when applied to jsonb values).
Nikita Glukhov
Reviewers have included (in no particular order) Andres Freund, Alexander
Korotkov, Pavel Stehule, Andrew Alsup, Erik Rijkers, Zihong Yu,
Himanshu Upadhyaya, Daniel Gustafsson, Justin Pryzby.
Discussion: https://postgr.es/m/cd0bb935-0158-78a7-08b5-904886deac4b@postgrespro.ru
2022-03-03 19:02:53 +01:00
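/*
 * Illustrative examples of the predicate's semantics as described above:
 *
 *		SELECT '[1, 2]' IS JSON ARRAY;							-- true
 *		SELECT '{"a": 1, "a": 2}' IS JSON WITH UNIQUE KEYS;		-- false
 *		SELECT '{"a": 1, "a": 2}' IS JSON WITHOUT UNIQUE KEYS;	-- true
 */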
|
|
|
case T_JsonIsPredicate:
|
|
|
|
_outJsonIsPredicate(str, obj);
|
|
|
|
break;
|
SQL/JSON query functions
This introduces the SQL/JSON functions for querying JSON data using
jsonpath expressions. The functions are:
JSON_EXISTS()
JSON_QUERY()
JSON_VALUE()
All of these functions only operate on jsonb. The workaround for now is
to cast the argument to jsonb.
JSON_EXISTS() tests if the jsonpath expression applied to the jsonb
value yields any values. JSON_VALUE() must return a single value, and an
error occurs if it tries to return multiple values. JSON_QUERY() must
return a json object or array, and there are various WRAPPER options for
handling scalar or multi-value results. Both these functions have
options for handling EMPTY and ERROR conditions.
Nikita Glukhov
Reviewers have included (in no particular order) Andres Freund, Alexander
Korotkov, Pavel Stehule, Andrew Alsup, Erik Rijkers, Zihong Yu,
Himanshu Upadhyaya, Daniel Gustafsson, Justin Pryzby.
Discussion: https://postgr.es/m/cd0bb935-0158-78a7-08b5-904886deac4b@postgrespro.ru
2022-03-03 19:11:14 +01:00
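/*
 * Illustrative examples (a sketch; the arguments are given as jsonb
 * literals since, per the above, these functions only operate on jsonb):
 *
 *		SELECT JSON_EXISTS(jsonb '{"a": [1, 2]}', '$.a[1]');		-- true
 *		SELECT JSON_VALUE(jsonb '{"a": 1}', '$.a');				-- 1
 *		SELECT JSON_QUERY(jsonb '[1, 2, 3]', '$[*]' WITH WRAPPER);	-- [1, 2, 3]
 */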
|
|
|
case T_JsonBehavior:
|
|
|
|
_outJsonBehavior(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonExpr:
|
|
|
|
_outJsonExpr(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonCoercion:
|
|
|
|
_outJsonCoercion(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonItemCoercions:
|
|
|
|
_outJsonItemCoercions(str, obj);
|
|
|
|
break;
|
JSON_TABLE
This feature allows jsonb data to be treated as a table and thus used in
a FROM clause like other tabular data. Data can be selected from the
jsonb using jsonpath expressions, and hoisted out of nested structures
in the jsonb to form multiple rows, more or less like an outer join.
Nikita Glukhov
Reviewers have included (in no particular order) Andres Freund, Alexander
Korotkov, Pavel Stehule, Andrew Alsup, Erik Rijkers, Zhihong Yu (whose
name I previously misspelled), Himanshu Upadhyaya, Daniel Gustafsson,
Justin Pryzby.
Discussion: https://postgr.es/m/7e2cb85d-24cf-4abb-30a5-1a33715959bd@postgrespro.ru
2022-04-04 21:36:03 +02:00
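/*
 * Illustrative example (a sketch): turning a jsonb array into rows.
 *
 *		SELECT jt.*
 *		FROM JSON_TABLE(jsonb '[{"a": 1}, {"a": 2}]', '$[*]'
 *						COLUMNS (a int PATH '$.a')) AS jt;
 */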
|
|
|
case T_JsonTableParent:
|
|
|
|
_outJsonTableParent(str, obj);
|
|
|
|
break;
|
|
|
|
case T_JsonTableSibling:
|
|
|
|
_outJsonTableSibling(str, obj);
|
|
|
|
break;
|
1999-02-23 09:01:47 +01:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
default:
|
2003-08-04 02:43:34 +02:00
|
|
|
|
2003-07-23 01:30:39 +02:00
|
|
|
/*
|
|
|
|
* This should be an ERROR, but it's too useful to be able to
|
2016-04-08 23:26:36 +02:00
|
|
|
* dump structures that outNode only understands part of.
|
2003-07-23 01:30:39 +02:00
|
|
|
*/
|
|
|
|
elog(WARNING, "could not dump unrecognized node type: %d",
|
|
|
|
(int) nodeTag(obj));
|
1996-07-09 08:22:35 +02:00
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-01-14 01:53:21 +01:00
|
|
|
appendStringInfoChar(str, '}');
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nodeToString -
|
1999-04-25 05:19:27 +02:00
|
|
|
* returns the ascii representation of the Node as a palloc'd string
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
char *
|
2011-12-07 20:46:56 +01:00
|
|
|
nodeToString(const void *obj)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1999-04-25 05:19:27 +02:00
|
|
|
StringInfoData str;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-04-25 05:19:27 +02:00
|
|
|
/* see stringinfo.h for an explanation of this maneuver */
|
|
|
|
initStringInfo(&str);
|
2016-04-08 23:26:36 +02:00
|
|
|
outNode(&str, obj);
|
1999-04-25 05:19:27 +02:00
|
|
|
return str.data;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
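/*
 * Example usage (an illustrative sketch; "plan" stands for any node tree):
 *
 *		elog(LOG, "%s", nodeToString(plan));
 *
 * The result can be turned back into a node tree with stringToNode(),
 * provided the node type has a reader in readfuncs.c.
 */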
|
2016-09-16 15:36:19 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* bmsToString -
|
|
|
|
* returns the ascii representation of the Bitmapset as a palloc'd string
|
|
|
|
*/
|
|
|
|
char *
|
|
|
|
bmsToString(const Bitmapset *bms)
|
|
|
|
{
|
|
|
|
StringInfoData str;
|
|
|
|
|
|
|
|
/* see stringinfo.h for an explanation of this maneuver */
|
|
|
|
initStringInfo(&str);
|
|
|
|
outBitmapset(&str, bms);
|
|
|
|
return str.data;
|
|
|
|
}
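/*
 * Example (an illustrative sketch): a set built with
 * bms_add_member(bms_make_singleton(1), 3) prints as "(b 1 3)".
 */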
|